diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index cae620baf46c..40d4d4cfa380 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e9606e15c4ec..7019de0b7507 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -425,7 +425,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 with: tool: wasm-pack - name: Run tests with headless mode @@ -752,7 +752,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 with: tool: cargo-msrv diff --git a/Cargo.lock b/Cargo.lock index e368dcf9a91e..120dc29db223 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -225,9 +225,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +checksum = "4df8bb5b0bd64c0b9bc61317fcc480bad0f00e56d3bc32c69a4c8dada4786bae" dependencies = [ "arrow-arith", "arrow-array", @@ -249,23 +249,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +checksum = "a1a640186d3bd30a24cb42264c2dafb30e236a6f50d510e56d40b708c9582491" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "num", + "num-traits", ] [[package]] name = "arrow-array" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +checksum = "219fe420e6800979744c8393b687afb0252b3f8a89b91027d27887b72aa36d31" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -275,25 +275,28 @@ dependencies = [ "chrono-tz", "half", "hashbrown 0.16.0", - "num", + "num-complex", + "num-integer", + "num-traits", ] [[package]] name = "arrow-buffer" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +checksum = "76885a2697a7edf6b59577f568b456afc94ce0e2edc15b784ce3685b6c3c5c27" dependencies = [ "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +checksum = "9c9ebb4c987e6b3b236fb4a14b20b34835abfdd80acead3ccf1f9bf399e1f168" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,15 
+309,15 @@ dependencies = [ "comfy-table", "half", "lexical-core", - "num", + "num-traits", "ryu", ] [[package]] name = "arrow-csv" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +checksum = "92386159c8d4bce96f8bd396b0642a0d544d471bdc2ef34d631aec80db40a09c" dependencies = [ "arrow-array", "arrow-cast", @@ -327,21 +330,22 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +checksum = "727681b95de313b600eddc2a37e736dcb21980a40f640314dcf360e2f36bc89b" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-flight" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c8b0ba0784d56bc6266b79f5de7a24b47024e7b3a0045d2ad4df3d9b686099f" +checksum = "f70bb56412a007b0cfc116d15f24dda6adeed9611a213852a004cda20085a3b9" dependencies = [ "arrow-arith", "arrow-array", @@ -359,16 +363,17 @@ dependencies = [ "futures", "once_cell", "paste", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", "tonic", + "tonic-prost", ] [[package]] name = "arrow-ipc" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +checksum = "da9ba92e3de170295c98a84e5af22e2b037f0c7b32449445e6c493b5fca27f27" dependencies = [ "arrow-array", "arrow-buffer", @@ -382,9 +387,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +checksum = "b969b4a421ae83828591c6bf5450bd52e6d489584142845ad6a861f42fe35df8" dependencies = [ "arrow-array", "arrow-buffer", @@ -394,19 +399,21 @@ dependencies = [ "chrono", "half", "indexmap 2.12.0", + "itoa", "lexical-core", "memchr", - "num", - "serde", + "num-traits", + "ryu", + "serde_core", "serde_json", "simdutf8", ] [[package]] name = "arrow-ord" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +checksum = "141c05298b21d03e88062317a1f1a73f5ba7b6eb041b350015b1cd6aabc0519b" dependencies = [ "arrow-array", "arrow-buffer", @@ -417,9 +424,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d924b32e96f8bb74d94cd82bd97b313c432fcb0ea331689ef9e7c6b8be4b258" +checksum = "cfcfb2be2e9096236f449c11f425cddde18c4cc540f516d90f066f10a29ed515" dependencies = [ "arrow-array", "arrow-data", @@ -429,9 +436,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" +checksum = "c5f3c06a6abad6164508ed283c7a02151515cef3de4b4ff2cebbcaeb85533db2" dependencies = [ "arrow-array", "arrow-buffer", @@ -442,34 +449,35 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.2.0" +version = "57.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" +checksum = "9cfa7a03d1eee2a4d061476e1840ad5c9867a544ca6c4c59256496af5d0a8be5" dependencies = [ "bitflags 2.9.4", "serde", + "serde_core", "serde_json", ] [[package]] name = "arrow-select" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +checksum = "bafa595babaad59f2455f4957d0f26448fb472722c186739f4fac0823a1bdb47" dependencies = [ "ahash 0.8.12", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +checksum = "32f46457dbbb99f2650ff3ac23e46a929e0ab81db809b02aa5511c258348bef2" dependencies = [ "arrow-array", "arrow-buffer", @@ -477,7 +485,7 @@ dependencies = [ "arrow-schema", "arrow-select", "memchr", - "num", + "num-traits", "regex", "regex-syntax", ] @@ -494,6 +502,22 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "astral-tokio-tar" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec179a06c1769b1e42e1e2cbe74c7dcdb3d6383c838454d063eaac5bbb7ebbe5" +dependencies = [ + "filetime", + "futures-core", + "libc", + "portable-atomic", + "rustc-hash", + "tokio", + "tokio-stream", + "xattr", +] + [[package]] name = "async-compression" version = "0.4.19" @@ -528,7 +552,29 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.108", ] [[package]] @@ -539,7 +585,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1009,7 +1055,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1069,13 +1115,17 @@ dependencies = [ [[package]] name = "bollard" -version = "0.18.1" +version = "0.19.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ccca1260af6a459d75994ad5acc1651bcabcbdbc41467cc9786519ab854c30" +checksum = "ec7646ee90964aa59e9f832a67182791396a19a5b1d76eb17599a8310a7e2e09" dependencies = [ + "async-stream", "base64 0.22.1", + "bitflags 2.9.4", + "bollard-buildkit-proto", "bollard-stubs", "bytes", + "chrono", "futures-core", "futures-util", "hex", @@ -1088,7 +1138,9 @@ dependencies = [ "hyper-util", "hyperlocal", "log", + "num", "pin-project-lite", + "rand 0.9.2", "rustls", "rustls-native-certs", "rustls-pemfile", @@ -1100,19 +1152,40 @@ dependencies = [ "serde_urlencoded", "thiserror", "tokio", + "tokio-stream", "tokio-util", + "tonic", "tower-service", "url", "winapi", ] +[[package]] 
+name = "bollard-buildkit-proto" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a885520bf6249ab931a764ffdb87b0ceef48e6e7d807cfdb21b751e086e1ad" +dependencies = [ + "prost", + "prost-types", + "tonic", + "tonic-prost", + "ureq", +] + [[package]] name = "bollard-stubs" -version = "1.47.1-rc.27.3.1" +version = "1.49.1-rc.28.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f179cfbddb6e77a5472703d4b30436bff32929c0aa8a9008ecf23d1d3cdd0da" +checksum = "5731fe885755e92beff1950774068e0cae67ea6ec7587381536fca84f1779623" dependencies = [ + "base64 0.22.1", + "bollard-buildkit-proto", + "bytes", + "chrono", + "prost", "serde", + "serde_json", "serde_repr", "serde_with", ] @@ -1139,7 +1212,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1162,7 +1235,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1388,9 +1461,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.48" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" +checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" dependencies = [ "clap_builder", "clap_derive", @@ -1398,9 +1471,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.48" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" +checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" dependencies = [ "anstream", "anstyle", @@ -1410,14 +1483,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.47" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1589,7 +1662,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.48", + "clap 4.5.50", "criterion-plot", "futures", "is-terminal", @@ -1732,7 +1805,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1743,7 +1816,7 @@ checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ "darling_core", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1912,7 +1985,7 @@ dependencies = [ "aws-config", "aws-credential-types", "chrono", - "clap 4.5.48", + "clap 4.5.50", "ctor", "datafusion", "datafusion-common", @@ -2143,7 +2216,7 @@ dependencies = [ "mimalloc", "nix", "object_store", - "prost 0.13.5", + "prost", "rand 0.9.2", "serde_json", "tempfile", @@ -2229,7 +2302,7 @@ dependencies = [ "doc-comment", "futures", "log", - "prost 0.13.5", + "prost", "semver", "tokio", ] @@ -2257,6 +2330,7 @@ dependencies = [ "itertools 0.14.0", "log", "md-5", + "num-traits", "rand 0.9.2", "regex", "sha2", @@ -2366,7 +2440,7 @@ version = "50.3.0" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -2532,7 +2606,7 @@ dependencies = [ "object_store", "pbjson", "pretty_assertions", - "prost 0.13.5", + "prost", "serde", "serde_json", "tokio", @@ -2546,7 +2620,7 @@ dependencies = [ "datafusion-common", 
"doc-comment", "pbjson", - "prost 0.13.5", + "prost", "serde", ] @@ -2635,7 +2709,7 @@ dependencies = [ "bigdecimal", "bytes", "chrono", - "clap 4.5.48", + "clap 4.5.50", "datafusion", "datafusion-spark", "datafusion-substrait", @@ -2674,7 +2748,7 @@ dependencies = [ "itertools 0.14.0", "object_store", "pbjson-types", - "prost 0.13.5", + "prost", "serde_json", "substrait", "tokio", @@ -2759,7 +2833,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -2815,7 +2889,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -2853,7 +2927,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -3114,7 +3188,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -3157,16 +3231,16 @@ dependencies = [ name = "gen" version = "0.1.0" dependencies = [ - "pbjson-build 0.8.0", - "prost-build 0.14.1", + "pbjson-build", + "prost-build", ] [[package]] name = "gen-common" version = "0.1.0" dependencies = [ - "pbjson-build 0.8.0", - "prost-build 0.14.1", + "pbjson-build", + "prost-build", ] [[package]] @@ -3255,9 +3329,9 @@ dependencies = [ [[package]] name = "half" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54c115d4f30f52c67202f079c5f9d8b49db4691f460fdb0b4c2e838261b2ba5" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", @@ -3506,7 +3580,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.0", + "socket2", "tokio", "tower-service", "tracing", @@ -3825,7 +3899,7 @@ checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -3968,7 +4042,7 @@ checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ "bitflags 2.9.4", "libc", - "redox_syscall 0.5.17", + "redox_syscall", ] [[package]] @@ -3979,7 +4053,7 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.48", + "clap 4.5.50", "escape8259", ] @@ -4390,16 +4464,16 @@ checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.17", + "redox_syscall", "smallvec", "windows-targets 0.52.6", ] [[package]] name = "parquet" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +checksum = "7a0f31027ef1af7549f7cec603a9a21dce706d3f8d7c2060a68f43c1773be95a" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -4418,8 +4492,9 @@ dependencies = [ "half", "hashbrown 0.16.0", "lz4_flex", - "num", "num-bigint", + "num-integer", + "num-traits", "object_store", "paste", "ring", @@ -4454,7 +4529,7 @@ dependencies = [ "regex", "regex-syntax", "structmeta", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4465,26 +4540,14 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pbjson" -version = "0.7.0" +version = "0.8.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7e6349fa080353f4a597daffd05cb81572a9c031a6d4fff7e504947496fcc68" +checksum = "898bac3fa00d0ba57a4e8289837e965baa2dee8c3749f3b11d45a64b4223d9c3" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "serde", ] -[[package]] -name = "pbjson-build" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" -dependencies = [ - "heck 0.5.0", - "itertools 0.13.0", - "prost 0.13.5", - "prost-types 0.13.5", -] - [[package]] name = "pbjson-build" version = "0.8.0" @@ -4493,22 +4556,22 @@ checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ "heck 0.5.0", "itertools 0.14.0", - "prost 0.14.1", - "prost-types 0.14.1", + "prost", + "prost-types", ] [[package]] name = "pbjson-types" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54e5e7bfb1652f95bc361d76f3c780d8e526b134b85417e774166ee941f0887" +checksum = "8e748e28374f10a330ee3bb9f29b828c0ac79831a32bab65015ad9b661ead526" dependencies = [ "bytes", "chrono", "pbjson", - "pbjson-build 0.7.0", - "prost 0.13.5", - "prost-build 0.13.5", + "pbjson-build", + "prost", + "prost-build", "serde", ] @@ -4594,7 +4657,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4667,7 +4730,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4742,7 +4805,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4787,16 +4850,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "prost" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" -dependencies = [ - "bytes", - "prost-derive 0.13.5", -] - [[package]] name = "prost" version = "0.14.1" @@ -4804,27 +4857,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7231bd9b3d3d33c86b58adbac74b5ec0ad9f496b19d22801d773636feaa95f3d" dependencies = [ "bytes", - "prost-derive 0.14.1", -] - -[[package]] -name = "prost-build" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" -dependencies = [ - "heck 0.5.0", - "itertools 0.14.0", - "log", - "multimap", - "once_cell", - "petgraph 0.7.1", - "prettyplease", - "prost 0.13.5", - "prost-types 0.13.5", - "regex", - "syn 2.0.106", - "tempfile", + "prost-derive", ] [[package]] @@ -4840,26 +4873,13 @@ dependencies = [ "once_cell", "petgraph 0.7.1", "prettyplease", - "prost 0.14.1", - "prost-types 0.14.1", + "prost", + "prost-types", "regex", - "syn 2.0.106", + "syn 2.0.108", "tempfile", ] -[[package]] -name = "prost-derive" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" -dependencies = [ - "anyhow", - "itertools 0.14.0", - "proc-macro2", - "quote", - "syn 2.0.106", -] - [[package]] name = "prost-derive" version = "0.14.1" @@ -4870,16 +4890,7 @@ dependencies = [ "itertools 0.14.0", 
"proc-macro2", "quote", - "syn 2.0.106", -] - -[[package]] -name = "prost-types" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" -dependencies = [ - "prost 0.13.5", + "syn 2.0.108", ] [[package]] @@ -4888,7 +4899,7 @@ version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9b4db3d6da204ed77bb26ba83b6122a73aeb2e87e25fbf7ad2e84c4ccbf8f72" dependencies = [ - "prost 0.14.1", + "prost", ] [[package]] @@ -4931,9 +4942,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" +checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" dependencies = [ "indoc", "libc", @@ -4948,19 +4959,18 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" +checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" dependencies = [ - "once_cell", "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" +checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" dependencies = [ "libc", "pyo3-build-config", @@ -4968,27 +4978,27 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" +checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] name = "pyo3-macros-backend" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" +checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" dependencies = [ "heck 0.5.0", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5020,7 +5030,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.0", + "socket2", "thiserror", "tokio", "tracing", @@ -5057,7 +5067,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.0", + "socket2", "tracing", "windows-sys 0.60.2", ] @@ -5199,16 +5209,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.106", -] - -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", + "syn 2.0.108", ] [[package]] @@ -5248,14 +5249,14 @@ checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] name = "regex" -version = "1.11.3" +version = "1.12.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -5265,9 +5266,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.11" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -5431,7 +5432,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.106", + "syn 2.0.108", "unicode-ident", ] @@ -5443,7 +5444,7 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand 0.8.5", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5498,6 +5499,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", @@ -5646,7 +5648,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5737,7 +5739,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5748,7 +5750,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5772,7 +5774,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5784,7 +5786,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5828,7 +5830,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5950,16 +5952,6 @@ dependencies = [ "cmake", ] -[[package]] -name = "socket2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "socket2" version = "0.6.0" @@ -6014,7 +6006,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6062,7 +6054,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6073,7 +6065,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6122,7 +6114,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6134,7 +6126,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6149,18 +6141,18 @@ dependencies = [ [[package]] name = "substrait" -version = "0.58.0" +version = "0.59.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6d24c270c6c672a86c183c3a8439ba46c1936f93cf7296aa692de3b0ff0228" +checksum = 
"540683f325ab9ab1a2008bc24588f3e76f63b6a3f52bc47e121122376a063639" dependencies = [ "heck 0.5.0", "pbjson", - "pbjson-build 0.7.0", + "pbjson-build", "pbjson-types", "prettyplease", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "protobuf-src", "regress", "schemars 0.8.22", @@ -6168,7 +6160,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.106", + "syn 2.0.108", "typify", "walkdir", ] @@ -6192,9 +6184,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.106" +version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", @@ -6218,7 +6210,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6273,13 +6265,13 @@ dependencies = [ [[package]] name = "testcontainers" -version = "0.24.0" +version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23bb7577dca13ad86a78e8271ef5d322f37229ec83b8d98da6d996c588a1ddb1" +checksum = "3f3ac71069f20ecfa60c396316c283fbf35e6833a53dff551a31b5458da05edc" dependencies = [ + "astral-tokio-tar", "async-trait", "bollard", - "bollard-stubs", "bytes", "docker_credential", "either", @@ -6295,16 +6287,16 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", - "tokio-tar", "tokio-util", + "ulid", "url", ] [[package]] name = "testcontainers-modules" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac95cde96549fc19c6bf19ef34cc42bd56e264c1cb97e700e21555be0ecf9e2" +checksum = "1966329d5bb3f89d33602d2db2da971fb839f9297dad16527abf4564e2ae0a6d" dependencies = [ "testcontainers", ] @@ -6335,7 +6327,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6445,7 +6437,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.0", + "socket2", "tokio-macros", "windows-sys 0.61.0", ] @@ -6458,7 +6450,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6481,7 +6473,7 @@ dependencies = [ "postgres-protocol", "postgres-types", "rand 0.9.2", - "socket2 0.6.0", + "socket2", "tokio", "tokio-util", "whoami", @@ -6508,21 +6500,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-tar" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75" -dependencies = [ - "filetime", - "futures-core", - "libc", - "redox_syscall 0.3.5", - "tokio", - "tokio-stream", - "xattr", -] - [[package]] name = "tokio-util" version = "0.7.16" @@ -6568,9 +6545,9 @@ dependencies = [ [[package]] name = "tonic" -version = "0.13.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" +checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203" dependencies = [ "async-trait", "axum", @@ -6585,8 +6562,8 @@ dependencies = [ "hyper-util", "percent-encoding", 
"pin-project", - "prost 0.13.5", - "socket2 0.5.10", + "socket2", + "sync_wrapper", "tokio", "tokio-stream", "tower", @@ -6595,6 +6572,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "tonic-prost" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.2" @@ -6663,7 +6651,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6771,7 +6759,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.106", + "syn 2.0.108", "thiserror", "unicode-ident", ] @@ -6789,10 +6777,20 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.106", + "syn 2.0.108", "typify-impl", ] +[[package]] +name = "ulid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" +dependencies = [ + "rand 0.9.2", + "web-time", +] + [[package]] name = "unicode-bidi" version = "0.3.18" @@ -6862,6 +6860,35 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "3.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99ba1025f18a4a3fc3e9b48c868e9beb4f24f4b4b1a325bada26bd4119f46537" +dependencies = [ + "base64 0.22.1", + "log", + "percent-encoding", + "rustls", + "rustls-pemfile", + "rustls-pki-types", + "ureq-proto", + "utf-8", + "webpki-roots", +] + +[[package]] +name = "ureq-proto" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b4531c118335662134346048ddb0e54cc86bd7e81866757873055f0e38f5d2" +dependencies = [ + "base64 0.22.1", + "http 1.3.1", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.7" @@ -6880,6 +6907,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -6985,7 +7018,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", "wasm-bindgen-shared", ] @@ -7020,7 +7053,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -7055,7 +7088,7 @@ checksum = "b673bca3298fe582aeef8352330ecbad91849f85090805582400850f8270a2e8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -7091,6 +7124,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b130c0d2d49f8b6889abc456e795e82525204f27c42cf767cf0d7734e089b8" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "whoami" version = "1.6.1" @@ -7200,7 +7242,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" dependencies = [ "proc-macro2", 
"quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -7211,7 +7253,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -7527,7 +7569,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", "synstructure", ] @@ -7548,7 +7590,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -7568,7 +7610,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", "synstructure", ] @@ -7608,7 +7650,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 3e0861c07ab0..bf0f3fa0510e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,19 +91,19 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.20", default-features = false } -arrow = { version = "56.2.0", features = [ +arrow = { version = "57.0.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "56.2.0", default-features = false } -arrow-flight = { version = "56.2.0", features = [ +arrow-buffer = { version = "57.0.0", default-features = false } +arrow-flight = { version = "57.0.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "56.2.0", default-features = false, features = [ +arrow-ipc = { version = "57.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "56.2.0", default-features = false } -arrow-schema = { version = "56.2.0", default-features = false } +arrow-ord = { version = "57.0.0", default-features = false } +arrow-schema = { version = "57.0.0", default-features = false } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" @@ -156,29 +156,30 @@ half = { version = "2.7.0", default-features = false } hashbrown = { version = "0.14.5", features = ["raw"] } hex = { version = "0.4.3" } indexmap = "2.12.0" +insta = { version = "1.43.2", features = ["glob", "filters"] } itertools = "0.14" log = "^0.4" +num-traits = { version = "0.2" } object_store = { version = "0.12.4", default-features = false } parking_lot = "0.12" -parquet = { version = "56.2.0", default-features = false, features = [ +parquet = { version = "57.0.0", default-features = false, features = [ "arrow", "async", "object_store", ] } -pbjson = { version = "0.7.0" } -pbjson-types = "0.7" +pbjson = { version = "0.8.0" } +pbjson-types = "0.8" # Should match arrow-flight's version of prost. 
-insta = { version = "1.43.2", features = ["glob", "filters"] } -prost = "0.13.1" +prost = "0.14.1" rand = "0.9" recursive = "0.1.1" -regex = "1.11" +regex = "1.12" rstest = "0.25.0" serde_json = "1" sqlparser = { version = "0.59.0", default-features = false, features = ["std", "visitor"] } tempfile = "3" -testcontainers = { version = "0.24", features = ["default"] } -testcontainers-modules = { version = "0.12" } +testcontainers = { version = "0.25.2", features = ["default"] } +testcontainers-modules = { version = "0.13" } tokio = { version = "1.48", features = ["macros", "rt", "sync"] } url = "2.5.7" diff --git a/README.md b/README.md index 4c4b955176b2..5191496eaafe 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ See [use cases] for examples. The following related subprojects target end users DataFusion. "Out of the box," -DataFusion offers [SQL] and [`Dataframe`] APIs, excellent [performance], +DataFusion offers [SQL](https://datafusion.apache.org/user-guide/sql/index.html) and [Dataframe](https://datafusion.apache.org/user-guide/dataframe.html) APIs, excellent [performance], built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and a great community. diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs index 2b66de641b67..cc59b7803036 100644 --- a/benchmarks/src/tpch/run.rs +++ b/benchmarks/src/tpch/run.rs @@ -92,6 +92,15 @@ pub struct RunOpt { #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] prefer_hash_join: BoolDefaultTrue, + /// If true then Piecewise Merge Join can be used, if false then it will opt for Nested Loop Join. + /// False by default. + #[structopt( + long = "enable_piecewise_merge_join", + default_value = "false" + )] + enable_piecewise_merge_join: BoolDefaultTrue, + /// Mark the first column of each table as sorted in ascending order. /// The tables should have been created with the `--sort` option for this to have any effect. #[structopt(short = "t", long = "sorted")] sorted: bool, @@ -112,6 +121,8 @@ impl RunOpt { .config()?
.with_collect_statistics(!self.disable_statistics); config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join; + config.options_mut().optimizer.enable_piecewise_merge_join = + self.enable_piecewise_merge_join; let rt_builder = self.common.runtime_env_builder()?; let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?); // register tables @@ -379,6 +390,7 @@ mod tests { output_path: None, disable_statistics: false, prefer_hash_join: true, + enable_piecewise_merge_join: false, sorted: false, }; opt.register_tables(&ctx).await?; @@ -416,6 +428,7 @@ mod tests { output_path: None, disable_statistics: false, prefer_hash_join: true, + enable_piecewise_merge_join: false, sorted: false, }; opt.register_tables(&ctx).await?; diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 53744e6c609b..f3069b492352 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -40,7 +40,7 @@ async-trait = { workspace = true } aws-config = "1.8.7" aws-credential-types = "1.2.7" chrono = { workspace = true } -clap = { version = "4.5.47", features = ["cargo", "derive"] } +clap = { version = "4.5.50", features = ["cargo", "derive"] } datafusion = { workspace = true, features = [ "avro", "compression", diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index 3ec446c51583..d23b12469e38 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -419,7 +419,9 @@ impl TableFunctionImpl for ParquetMetadataFunc { stats_max_value_arr.push(None); }; compression_arr.push(format!("{:?}", column.compression())); - encodings_arr.push(format!("{:?}", column.encodings())); + // need to collect into Vec to format + let encodings: Vec<_> = column.encodings().collect(); + encodings_arr.push(format!("{:?}", encodings)); index_page_offset_arr.push(column.index_page_offset()); dictionary_page_offset_arr.push(column.dictionary_page_offset()); data_page_offset_arr.push(column.data_page_offset()); diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index bdb2fdf5198e..09fa8ef15af8 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -497,7 +497,7 @@ mod tests { +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | 
../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 | + | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [PLAIN, RLE, RLE_DICTIONARY] | | 4 | 46 | 121 | 123 | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -510,7 +510,7 @@ mod tests { +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 | + | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [PLAIN, RLE, RLE_DICTIONARY] | | 4 | 46 | 121 | 123 | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -532,7 +532,7 @@ mod tests { 
+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 | + | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [PLAIN, RLE, BIT_PACKED] | | | 4 | 152 | 163 | +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -592,9 +592,9 @@ mod tests { +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ - | alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | - | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false | + | alltypes_plain.parquet | 1851 | 6957 | 2 | page_index=false | + | alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true | + | lz4_raw_compressed_larger.parquet | 380836 | 996 | 2 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ "); @@ -623,9 +623,9 @@ mod tests { +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ - | alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false | - | 
alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | - | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false | + | alltypes_plain.parquet | 1851 | 6957 | 5 | page_index=false | + | alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true | + | lz4_raw_compressed_larger.parquet | 380836 | 996 | 3 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ "); diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 68bb5376a1ac..bb0525e57753 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -81,7 +81,7 @@ serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.13.1" +tonic = "0.14" tracing = { version = "0.1" } tracing-subscriber = { version = "0.3" } url = { workspace = true } diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 55400e219283..1c560be6d08a 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -502,7 +502,7 @@ impl TableProvider for IndexTableProvider { let file_scan_config = FileScanConfigBuilder::new(object_store_url, schema, file_source) .with_limit(limit) - .with_projection(projection.cloned()) + .with_projection_indices(projection.cloned()) .with_file(partitioned_file) .build(); diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index 1a2c2cbff418..ef2a3eaca0c8 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -31,7 +31,9 @@ use datafusion::{ test_util::aggr_test_schema, }; -use datafusion::datasource::physical_plan::FileScanConfigBuilder; +use datafusion::datasource::{ + physical_plan::FileScanConfigBuilder, table_schema::TableSchema, +}; use futures::StreamExt; use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; @@ -60,14 +62,14 @@ async fn csv_opener() -> Result<()> { Arc::clone(&schema), Arc::new(CsvSource::default()), ) - .with_projection(Some(vec![12, 0])) + .with_projection_indices(Some(vec![12, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.display().to_string(), 10)) .build(); let config = CsvSource::new(true, b',', b'"') .with_comment(Some(b'#')) - .with_schema(schema) + .with_schema(TableSchema::from_file_schema(schema)) .with_batch_size(8192) .with_projection(&scan_config); @@ -126,7 +128,7 @@ async fn json_opener() -> Result<()> { schema, Arc::new(JsonSource::default()), ) - .with_projection(Some(vec![1, 0])) + .with_projection_indices(Some(vec![1, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.to_string(), 10)) .build(); diff --git a/datafusion-examples/examples/default_column_values.rs b/datafusion-examples/examples/default_column_values.rs index 43e2d4ca0988..d3a7d2ec67f3 100644 --- a/datafusion-examples/examples/default_column_values.rs +++ b/datafusion-examples/examples/default_column_values.rs @@ -260,7 +260,7 @@ impl TableProvider for DefaultValueTableProvider { self.schema.clone(), Arc::new(parquet_source), ) - .with_projection(projection.cloned()) + .with_projection_indices(projection.cloned()) .with_limit(limit) .with_file_group(file_group) 
.with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _)); diff --git a/datafusion-examples/examples/flight/flight_client.rs b/datafusion-examples/examples/flight/flight_client.rs index e3237284b430..ff4b5903ad88 100644 --- a/datafusion-examples/examples/flight/flight_client.rs +++ b/datafusion-examples/examples/flight/flight_client.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use std::sync::Arc; +use tonic::transport::Endpoint; use datafusion::arrow::datatypes::Schema; @@ -34,7 +35,9 @@ async fn main() -> Result<(), Box> { let testdata = datafusion::test_util::parquet_test_data(); // Create Flight client - let mut client = FlightServiceClient::connect("http://localhost:50051").await?; + let endpoint = Endpoint::new("http://localhost:50051")?; + let channel = endpoint.connect().await?; + let mut client = FlightServiceClient::new(channel); // Call get_schema to get the schema of a Parquet file let request = tonic::Request::new(FlightDescriptor { diff --git a/datafusion-examples/examples/flight/flight_server.rs b/datafusion-examples/examples/flight/flight_server.rs index 58bfb7a341c1..22265e415fbd 100644 --- a/datafusion-examples/examples/flight/flight_server.rs +++ b/datafusion-examples/examples/flight/flight_server.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator}; +use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator}; use std::sync::Arc; use arrow_flight::{PollInfo, SchemaAsIpc}; @@ -106,6 +106,7 @@ impl FlightService for FlightServiceImpl { // add an initial FlightData message that sends schema let options = arrow::ipc::writer::IpcWriteOptions::default(); + let mut compression_context = CompressionContext::default(); let schema_flight_data = SchemaAsIpc::new(&schema, &options); let mut flights = vec![FlightData::from(schema_flight_data)]; @@ -115,7 +116,7 @@ impl FlightService for FlightServiceImpl { for batch in &results { let (flight_dictionaries, flight_batch) = encoder - .encoded_batch(batch, &mut tracker, &options) + .encode(batch, &mut tracker, &options, &mut compression_context) .map_err(|e: ArrowError| Status::internal(e.to_string()))?; flights.extend(flight_dictionaries.into_iter().map(Into::into)); diff --git a/datafusion-examples/examples/parquet_encrypted.rs b/datafusion-examples/examples/parquet_encrypted.rs index e9e239b7a1c3..690d9f2a5f14 100644 --- a/datafusion-examples/examples/parquet_encrypted.rs +++ b/datafusion-examples/examples/parquet_encrypted.rs @@ -16,12 +16,13 @@ // under the License. 
use datafusion::common::DataFusionError; -use datafusion::config::TableParquetOptions; +use datafusion::config::{ConfigFileEncryptionProperties, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::logical_expr::{col, lit}; use datafusion::parquet::encryption::decrypt::FileDecryptionProperties; use datafusion::parquet::encryption::encrypt::FileEncryptionProperties; use datafusion::prelude::{ParquetReadOptions, SessionContext}; +use std::sync::Arc; use tempfile::TempDir; #[tokio::main] @@ -55,7 +56,7 @@ async fn main() -> datafusion::common::Result<()> { // Write encrypted parquet let mut options = TableParquetOptions::default(); - options.crypto.file_encryption = Some((&encrypt).into()); + options.crypto.file_encryption = Some(ConfigFileEncryptionProperties::from(&encrypt)); parquet_df .write_parquet( tempfile_str.as_str(), @@ -100,7 +101,8 @@ async fn query_dataframe(df: &DataFrame) -> Result<(), DataFusionError> { // Setup encryption and decryption properties fn setup_encryption( parquet_df: &DataFrame, -) -> Result<(FileEncryptionProperties, FileDecryptionProperties), DataFusionError> { +) -> Result<(Arc<FileEncryptionProperties>, Arc<FileDecryptionProperties>), DataFusionError> +{ let schema = parquet_df.schema(); let footer_key = b"0123456789012345".to_vec(); // 128bit/16 let column_key = b"1234567890123450".to_vec(); // 128bit/16 diff --git a/datafusion-examples/examples/parquet_encrypted_with_kms.rs b/datafusion-examples/examples/parquet_encrypted_with_kms.rs index 19b0e8d0b199..45bfd183773a 100644 --- a/datafusion-examples/examples/parquet_encrypted_with_kms.rs +++ b/datafusion-examples/examples/parquet_encrypted_with_kms.rs @@ -226,7 +226,7 @@ impl EncryptionFactory for TestEncryptionFactory { options: &EncryptionFactoryOptions, schema: &SchemaRef, _file_path: &Path, - ) -> Result<Option<FileEncryptionProperties>> { + ) -> Result<Option<Arc<FileEncryptionProperties>>> { let config: EncryptionConfig = options.to_extension_options()?; // Generate a random encryption key for this file.
@@ -268,7 +268,7 @@ impl EncryptionFactory for TestEncryptionFactory { &self, _options: &EncryptionFactoryOptions, _file_path: &Path, - ) -> Result> { + ) -> Result>> { let decryption_properties = FileDecryptionProperties::with_key_retriever(Arc::new(TestKeyRetriever {})) .build()?; diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index afc3b279f4a9..127c55da982c 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -246,7 +246,7 @@ impl TableProvider for IndexTableProvider { let source = Arc::new(ParquetSource::default().with_predicate(predicate)); let mut file_scan_config_builder = FileScanConfigBuilder::new(object_store_url, self.schema(), source) - .with_projection(projection.cloned()) + .with_projection_indices(projection.cloned()) .with_limit(limit); // Transform to the format needed to pass to DataSourceExec diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index e9ac1bf097a2..95f9523d4401 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -499,7 +499,7 @@ impl TableProvider for ListingTable { .with_file_groups(partitioned_file_lists) .with_constraints(self.constraints.clone()) .with_statistics(statistics) - .with_projection(projection) + .with_projection_indices(projection) .with_limit(limit) .with_output_ordering(output_ordering) .with_table_partition_cols(table_partition_cols) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index f5e51cb236d4..abeb4e66a269 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.25", optional = true } +pyo3 = { version = "0.26", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 271ba6ddcff5..bc321b227ee5 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -26,14 +26,15 @@ use crate::format::{ExplainAnalyzeLevel, ExplainFormat}; use crate::parsers::CompressionTypeVariant; use crate::utils::get_available_parallelism; use crate::{DataFusionError, Result}; +#[cfg(feature = "parquet_encryption")] +use hex; use std::any::Any; use std::collections::{BTreeMap, HashMap}; use std::error::Error; use std::fmt::{self, Display}; use std::str::FromStr; - #[cfg(feature = "parquet_encryption")] -use hex; +use std::sync::Arc; /// A macro that wraps a configuration struct and automatically derives /// [`Default`] and [`ConfigField`] for it, allowing it to be used @@ -56,7 +57,7 @@ use hex; /// /// Field 3 doc /// field3: Option, default = None /// } -///} +/// } /// ``` /// /// Will generate @@ -620,7 +621,10 @@ config_namespace! { /// bytes of the parquet file optimistically. If not specified, two reads are required: /// One read to fetch the 8-byte parquet footer and /// another to fetch the metadata length encoded in the footer - pub metadata_size_hint: Option, default = None + /// Default setting to 512 KiB, which should be sufficient for most parquet files, + /// it can reduce one I/O operation per parquet file. 
If the metadata is larger than + /// the hint, two reads will still be performed. + pub metadata_size_hint: Option, default = Some(512 * 1024) /// (reading) If true, filter expressions are be applied during the parquet decoding operation to /// reduce the number of rows decoded. This optimization is sometimes called "late materialization". @@ -1322,36 +1326,35 @@ impl ConfigOptions { /// # Example /// ``` /// use datafusion_common::{ -/// config::ConfigExtension, extensions_options, -/// config::ConfigOptions, +/// config::ConfigExtension, config::ConfigOptions, extensions_options, /// }; -/// // Define a new configuration struct using the `extensions_options` macro -/// extensions_options! { -/// /// My own config options. -/// pub struct MyConfig { -/// /// Should "foo" be replaced by "bar"? -/// pub foo_to_bar: bool, default = true +/// // Define a new configuration struct using the `extensions_options` macro +/// extensions_options! { +/// /// My own config options. +/// pub struct MyConfig { +/// /// Should "foo" be replaced by "bar"? +/// pub foo_to_bar: bool, default = true /// -/// /// How many "baz" should be created? -/// pub baz_count: usize, default = 1337 -/// } -/// } +/// /// How many "baz" should be created? +/// pub baz_count: usize, default = 1337 +/// } +/// } /// -/// impl ConfigExtension for MyConfig { +/// impl ConfigExtension for MyConfig { /// const PREFIX: &'static str = "my_config"; -/// } +/// } /// -/// // set up config struct and register extension -/// let mut config = ConfigOptions::default(); -/// config.extensions.insert(MyConfig::default()); +/// // set up config struct and register extension +/// let mut config = ConfigOptions::default(); +/// config.extensions.insert(MyConfig::default()); /// -/// // overwrite config default -/// config.set("my_config.baz_count", "42").unwrap(); +/// // overwrite config default +/// config.set("my_config.baz_count", "42").unwrap(); /// -/// // check config state -/// let my_config = config.extensions.get::().unwrap(); -/// assert!(my_config.foo_to_bar,); -/// assert_eq!(my_config.baz_count, 42,); +/// // check config state +/// let my_config = config.extensions.get::().unwrap(); +/// assert!(my_config.foo_to_bar,); +/// assert_eq!(my_config.baz_count, 42,); /// ``` /// /// # Note: @@ -2409,13 +2412,13 @@ impl From for FileEncryptionProperties { hex::decode(&val.aad_prefix_as_hex).expect("Invalid AAD prefix"); fep = fep.with_aad_prefix(aad_prefix); } - fep.build().unwrap() + Arc::unwrap_or_clone(fep.build().unwrap()) } } #[cfg(feature = "parquet_encryption")] -impl From<&FileEncryptionProperties> for ConfigFileEncryptionProperties { - fn from(f: &FileEncryptionProperties) -> Self { +impl From<&Arc> for ConfigFileEncryptionProperties { + fn from(f: &Arc) -> Self { let (column_names_vec, column_keys_vec, column_metas_vec) = f.column_keys(); let mut column_encryption_properties: HashMap< @@ -2557,13 +2560,13 @@ impl From for FileDecryptionProperties { fep = fep.with_aad_prefix(aad_prefix); } - fep.build().unwrap() + Arc::unwrap_or_clone(fep.build().unwrap()) } } #[cfg(feature = "parquet_encryption")] -impl From<&FileDecryptionProperties> for ConfigFileDecryptionProperties { - fn from(f: &FileDecryptionProperties) -> Self { +impl From<&Arc> for ConfigFileDecryptionProperties { + fn from(f: &Arc) -> Self { let (column_names_vec, column_keys_vec) = f.column_keys(); let mut column_decryption_properties: HashMap< String, @@ -2834,6 +2837,7 @@ mod tests { }; use std::any::Any; use std::collections::HashMap; + use std::sync::Arc; 
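A small sketch of inspecting and overriding the new `metadata_size_hint` default through `ConfigOptions`; the string key follows the documented `datafusion.execution.parquet.*` naming and is an assumption here:

```rust
use datafusion_common::config::ConfigOptions;

fn main() -> datafusion_common::Result<()> {
    let mut config = ConfigOptions::default();
    // Default is now Some(512 * 1024) rather than None, so a single
    // optimistic read usually fetches both the footer and the metadata.
    assert_eq!(config.execution.parquet.metadata_size_hint, Some(512 * 1024));

    // Raise the hint for files with unusually large metadata
    // (key string assumed from the configuration docs).
    config.set("datafusion.execution.parquet.metadata_size_hint", "1048576")?;
    assert_eq!(config.execution.parquet.metadata_size_hint, Some(1024 * 1024));
    Ok(())
}
```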
#[derive(Default, Debug, Clone)] pub struct TestExtensionConfig { @@ -2990,16 +2994,15 @@ mod tests { .unwrap(); // Test round-trip - let config_encrypt: ConfigFileEncryptionProperties = - (&file_encryption_properties).into(); - let encryption_properties_built: FileEncryptionProperties = - config_encrypt.clone().into(); + let config_encrypt = + ConfigFileEncryptionProperties::from(&file_encryption_properties); + let encryption_properties_built = + Arc::new(FileEncryptionProperties::from(config_encrypt.clone())); assert_eq!(file_encryption_properties, encryption_properties_built); - let config_decrypt: ConfigFileDecryptionProperties = - (&decryption_properties).into(); - let decryption_properties_built: FileDecryptionProperties = - config_decrypt.clone().into(); + let config_decrypt = ConfigFileDecryptionProperties::from(&decryption_properties); + let decryption_properties_built = + Arc::new(FileDecryptionProperties::from(config_decrypt.clone())); assert_eq!(decryption_properties, decryption_properties_built); /////////////////////////////////////////////////////////////////////////////////// diff --git a/datafusion/common/src/datatype.rs b/datafusion/common/src/datatype.rs new file mode 100644 index 000000000000..65f639521186 --- /dev/null +++ b/datafusion/common/src/datatype.rs @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`DataTypeExt`] and [`FieldExt`] extension trait for working with DataTypes to Fields + +use crate::arrow::datatypes::{DataType, Field, FieldRef}; +use std::sync::Arc; + +/// DataFusion extension methods for Arrow [`DataType`] +pub trait DataTypeExt { + /// Convert the type to field with nullable type and "" name + /// + /// This is used to track the places where we convert a [`DataType`] + /// into a nameless field to interact with an API that is + /// capable of representing an extension type and/or nullability. + /// + /// For example, it will convert a `DataType::Int32` into + /// `Field::new("", DataType::Int32, true)`. + /// + /// ``` + /// # use datafusion_common::datatype::DataTypeExt; + /// # use arrow::datatypes::DataType; + /// let dt = DataType::Utf8; + /// let field = dt.into_nullable_field(); + /// // result is a nullable Utf8 field with "" name + /// assert_eq!(field.name(), ""); + /// assert_eq!(field.data_type(), &DataType::Utf8); + /// assert!(field.is_nullable()); + /// ``` + fn into_nullable_field(self) -> Field; + + /// Convert the type to [`FieldRef`] with nullable type and "" name + /// + /// Concise wrapper around [`DataTypeExt::into_nullable_field`] that + /// constructs a [`FieldRef`]. 
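The `FieldRef` variant mirrors `into_nullable_field` above, just Arc-wrapped up front; a minimal sketch:

```rust
use arrow::datatypes::{DataType, FieldRef};
use datafusion_common::datatype::DataTypeExt;

fn main() {
    // Same nameless, nullable field as into_nullable_field, but returned as
    // an Arc<Field> ready to hand to APIs that take FieldRef.
    let field: FieldRef = DataType::Utf8.into_nullable_field_ref();
    assert_eq!(field.name(), "");
    assert_eq!(field.data_type(), &DataType::Utf8);
    assert!(field.is_nullable());
}
```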
+ fn into_nullable_field_ref(self) -> FieldRef; +} + +impl DataTypeExt for DataType { + fn into_nullable_field(self) -> Field { + Field::new("", self, true) + } + + fn into_nullable_field_ref(self) -> FieldRef { + Arc::new(Field::new("", self, true)) + } +} + +/// DataFusion extension methods for Arrow [`Field`] and [`FieldRef`] +pub trait FieldExt { + /// Returns a new Field representing a List of this Field's DataType. + /// + /// For example if input represents an `Int32`, the return value will + /// represent a `List`. + /// + /// Example: + /// ``` + /// # use std::sync::Arc; + /// # use arrow::datatypes::{DataType, Field}; + /// # use datafusion_common::datatype::FieldExt; + /// // Int32 field + /// let int_field = Field::new("my_int", DataType::Int32, true); + /// // convert to a List field + /// let list_field = int_field.into_list(); + /// // List + /// // Note that the item field name has been renamed to "item" + /// assert_eq!(list_field.data_type(), &DataType::List(Arc::new( + /// Field::new("item", DataType::Int32, true) + /// ))); + fn into_list(self) -> Self; + + /// Return a new Field representing this Field as the item type of a + /// [`DataType::FixedSizeList`] + /// + /// For example if input represents an `Int32`, the return value will + /// represent a `FixedSizeList`. + /// + /// Example: + /// ``` + /// # use std::sync::Arc; + /// # use arrow::datatypes::{DataType, Field}; + /// # use datafusion_common::datatype::FieldExt; + /// // Int32 field + /// let int_field = Field::new("my_int", DataType::Int32, true); + /// // convert to a FixedSizeList field of size 3 + /// let fixed_size_list_field = int_field.into_fixed_size_list(3); + /// // FixedSizeList + /// // Note that the item field name has been renamed to "item" + /// assert_eq!( + /// fixed_size_list_field.data_type(), + /// &DataType::FixedSizeList(Arc::new( + /// Field::new("item", DataType::Int32, true)), + /// 3 + /// )); + fn into_fixed_size_list(self, list_size: i32) -> Self; + + /// Update the field to have the default list field name ("item") + /// + /// Lists are allowed to have an arbitrarily named field; however, a name + /// other than 'item' will cause it to fail an == check against a more + /// idiomatically created list in arrow-rs which causes issues. + /// + /// For example, if input represents an `Int32` field named "my_int", + /// the return value will represent an `Int32` field named "item". 
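Since the trait is also implemented for `Arc<Field>` (see the impls below), list wrapping works directly on a `FieldRef` without unwrapping the Arc; a short sketch:

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, FieldRef};
use datafusion_common::datatype::FieldExt;

fn main() {
    let item: FieldRef = Arc::new(Field::new("my_int", DataType::Int32, true));
    // Wraps the item into List<Int32>, renaming the inner field to the
    // default "item" name so comparisons against idiomatically built
    // arrow-rs lists hold.
    let list = item.into_list();
    assert_eq!(
        list.data_type(),
        &DataType::List(Arc::new(Field::new("item", DataType::Int32, true)))
    );
    assert!(list.is_nullable());
}
```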
+ /// + /// Example: + /// ``` + /// # use arrow::datatypes::Field; + /// # use datafusion_common::datatype::FieldExt; + /// let my_field = Field::new("my_int", arrow::datatypes::DataType::Int32, true); + /// let item_field = my_field.into_list_item(); + /// assert_eq!(item_field.name(), Field::LIST_FIELD_DEFAULT_NAME); + /// assert_eq!(item_field.name(), "item"); + /// ``` + fn into_list_item(self) -> Self; +} + +impl FieldExt for Field { + fn into_list(self) -> Self { + DataType::List(Arc::new(self.into_list_item())).into_nullable_field() + } + + fn into_fixed_size_list(self, list_size: i32) -> Self { + DataType::FixedSizeList(self.into_list_item().into(), list_size) + .into_nullable_field() + } + + fn into_list_item(self) -> Self { + if self.name() != Field::LIST_FIELD_DEFAULT_NAME { + self.with_name(Field::LIST_FIELD_DEFAULT_NAME) + } else { + self + } + } +} + +impl FieldExt for Arc { + fn into_list(self) -> Self { + DataType::List(self.into_list_item()) + .into_nullable_field() + .into() + } + + fn into_fixed_size_list(self, list_size: i32) -> Self { + DataType::FixedSizeList(self.into_list_item(), list_size) + .into_nullable_field() + .into() + } + + fn into_list_item(self) -> Self { + if self.name() != Field::LIST_FIELD_DEFAULT_NAME { + Arc::unwrap_or_clone(self) + .with_name(Field::LIST_FIELD_DEFAULT_NAME) + .into() + } else { + self + } + } +} diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 6866b4011f9e..24d152a7dba8 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -56,12 +56,10 @@ pub type DFSchemaRef = Arc; /// an Arrow schema. /// /// ```rust -/// use datafusion_common::{DFSchema, Column}; /// use arrow::datatypes::{DataType, Field, Schema}; +/// use datafusion_common::{Column, DFSchema}; /// -/// let arrow_schema = Schema::new(vec![ -/// Field::new("c1", DataType::Int32, false), -/// ]); +/// let arrow_schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); /// /// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap(); /// let column = Column::from_qualified_name("t1.c1"); @@ -77,12 +75,10 @@ pub type DFSchemaRef = Arc; /// Create an unqualified schema using TryFrom: /// /// ```rust -/// use datafusion_common::{DFSchema, Column}; /// use arrow::datatypes::{DataType, Field, Schema}; +/// use datafusion_common::{Column, DFSchema}; /// -/// let arrow_schema = Schema::new(vec![ -/// Field::new("c1", DataType::Int32, false), -/// ]); +/// let arrow_schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); /// /// let df_schema = DFSchema::try_from(arrow_schema).unwrap(); /// let column = Column::new_unqualified("c1"); @@ -94,13 +90,15 @@ pub type DFSchemaRef = Arc; /// Use the `Into` trait to convert `DFSchema` into an Arrow schema: /// /// ```rust +/// use arrow::datatypes::{Field, Schema}; /// use datafusion_common::DFSchema; -/// use arrow::datatypes::{Schema, Field}; /// use std::collections::HashMap; /// -/// let df_schema = DFSchema::from_unqualified_fields(vec![ -/// Field::new("c1", arrow::datatypes::DataType::Int32, false), -/// ].into(),HashMap::new()).unwrap(); +/// let df_schema = DFSchema::from_unqualified_fields( +/// vec![Field::new("c1", arrow::datatypes::DataType::Int32, false)].into(), +/// HashMap::new(), +/// ) +/// .unwrap(); /// let schema: &Schema = df_schema.as_arrow(); /// assert_eq!(schema.fields().len(), 1); /// ``` @@ -884,22 +882,26 @@ impl DFSchema { /// # Example /// /// ``` - /// use datafusion_common::DFSchema; /// 
use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_common::DFSchema; /// use std::collections::HashMap; /// /// let schema = DFSchema::from_unqualified_fields( /// vec![ /// Field::new("id", DataType::Int32, false), /// Field::new("name", DataType::Utf8, true), - /// ].into(), - /// HashMap::new() - /// ).unwrap(); + /// ] + /// .into(), + /// HashMap::new(), + /// ) + /// .unwrap(); /// - /// assert_eq!(schema.tree_string().to_string(), - /// r#"root + /// assert_eq!( + /// schema.tree_string().to_string(), + /// r#"root /// |-- id: int32 (nullable = false) - /// |-- name: utf8 (nullable = true)"#); + /// |-- name: utf8 (nullable = true)"# + /// ); /// ``` pub fn tree_string(&self) -> impl Display + '_ { let mut result = String::from("root\n"); @@ -1417,7 +1419,7 @@ mod tests { fn from_qualified_schema_into_arrow_schema() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let arrow_schema = schema.as_arrow(); - insta::assert_snapshot!(arrow_schema, @r#"Field { name: "c0", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c1", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }"#); + insta::assert_snapshot!(arrow_schema.to_string(), @r#"Field { "c0": nullable Boolean }, Field { "c1": nullable Boolean }"#); Ok(()) } diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 0dce8e6a56ec..b25bf1c12e44 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -30,8 +30,11 @@ use crate::Span; /// ```rust /// # use datafusion_common::{Location, Span, Diagnostic}; /// let span = Some(Span { -/// start: Location{ line: 2, column: 1 }, -/// end: Location{ line: 4, column: 15 } +/// start: Location { line: 2, column: 1 }, +/// end: Location { +/// line: 4, +/// column: 15, +/// }, /// }); /// let diagnostic = Diagnostic::new_error("Something went wrong", span) /// .with_help("Have you tried turning it on and off again?", None); diff --git a/datafusion/common/src/encryption.rs b/datafusion/common/src/encryption.rs index b764ad77cff1..2a8cfdbc8996 100644 --- a/datafusion/common/src/encryption.rs +++ b/datafusion/common/src/encryption.rs @@ -24,38 +24,10 @@ pub use parquet::encryption::decrypt::FileDecryptionProperties; pub use parquet::encryption::encrypt::FileEncryptionProperties; #[cfg(not(feature = "parquet_encryption"))] -#[derive(Default, Debug)] +#[derive(Default, Clone, Debug)] pub struct FileDecryptionProperties; #[cfg(not(feature = "parquet_encryption"))] -#[derive(Default, Debug)] +#[derive(Default, Clone, Debug)] pub struct FileEncryptionProperties; pub use crate::config::{ConfigFileDecryptionProperties, ConfigFileEncryptionProperties}; - -#[cfg(feature = "parquet_encryption")] -pub fn map_encryption_to_config_encryption( - encryption: Option<&FileEncryptionProperties>, -) -> Option { - encryption.map(|fe| fe.into()) -} - -#[cfg(not(feature = "parquet_encryption"))] -pub fn map_encryption_to_config_encryption( - _encryption: Option<&FileEncryptionProperties>, -) -> Option { - None -} - -#[cfg(feature = "parquet_encryption")] -pub fn map_config_decryption_to_decryption( - decryption: &ConfigFileDecryptionProperties, -) -> FileDecryptionProperties { - decryption.clone().into() -} - -#[cfg(not(feature = "parquet_encryption"))] -pub fn map_config_decryption_to_decryption( - _decryption: &ConfigFileDecryptionProperties, -) -> FileDecryptionProperties { - FileDecryptionProperties {} 
-} diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 210f0442972d..fde52944d049 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -684,7 +684,10 @@ impl DataFusionError { /// let mut builder = DataFusionError::builder(); /// builder.add_error(DataFusionError::Internal("foo".to_owned())); /// // ok_or returns the value if no errors have been added -/// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); +/// assert_contains!( +/// builder.error_or(42).unwrap_err().to_string(), +/// "Internal error: foo" +/// ); /// ``` #[derive(Debug, Default)] pub struct DataFusionErrorBuilder(Vec); @@ -702,7 +705,10 @@ impl DataFusionErrorBuilder { /// # use datafusion_common::{assert_contains, DataFusionError}; /// let mut builder = DataFusionError::builder(); /// builder.add_error(DataFusionError::Internal("foo".to_owned())); - /// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); + /// assert_contains!( + /// builder.error_or(42).unwrap_err().to_string(), + /// "Internal error: foo" + /// ); /// ``` pub fn add_error(&mut self, error: DataFusionError) { self.0.push(error); @@ -714,8 +720,11 @@ impl DataFusionErrorBuilder { /// ``` /// # use datafusion_common::{assert_contains, DataFusionError}; /// let builder = DataFusionError::builder() - /// .with_error(DataFusionError::Internal("foo".to_owned())); - /// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); + /// .with_error(DataFusionError::Internal("foo".to_owned())); + /// assert_contains!( + /// builder.error_or(42).unwrap_err().to_string(), + /// "Internal error: foo" + /// ); /// ``` pub fn with_error(mut self, error: DataFusionError) -> Self { self.0.push(error); diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 3977f2b489e1..564929c61bab 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -402,15 +402,14 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result, +} + +impl ScalarAndMetadata { + /// Create a new Literal from a scalar value with optional [`FieldMetadata`] + pub fn new(value: ScalarValue, metadata: Option) -> Self { + Self { value, metadata } + } + + /// Access the underlying [ScalarValue] storage + pub fn value(&self) -> &ScalarValue { + &self.value + } + + /// Access the [FieldMetadata] attached to this value, if any + pub fn metadata(&self) -> Option<&FieldMetadata> { + self.metadata.as_ref() + } + + /// Consume self and return components + pub fn into_inner(self) -> (ScalarValue, Option) { + (self.value, self.metadata) + } + + /// Cast this values's storage type + /// + /// This operation assumes that if the underlying [ScalarValue] can be casted + /// to a given type that any extension type represented by the metadata is also + /// valid. 
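A minimal sketch of `ScalarAndMetadata` in use, assuming an Int32-to-Int64 storage cast (which `ScalarValue::cast_to` supports) and an illustrative metadata key:

```rust
use std::collections::HashMap;
use arrow::datatypes::DataType;
use datafusion_common::metadata::{FieldMetadata, ScalarAndMetadata};
use datafusion_common::ScalarValue;

fn main() -> datafusion_common::Result<()> {
    let metadata = FieldMetadata::from(HashMap::from([(
        "unit".to_string(),
        "seconds".to_string(),
    )]));
    let lit = ScalarAndMetadata::new(ScalarValue::Int32(Some(42)), Some(metadata));

    // Only the storage type changes; the attached metadata is cloned through.
    let casted = lit.cast_storage_to(&DataType::Int64)?;
    assert_eq!(casted.value(), &ScalarValue::Int64(Some(42)));
    assert_eq!(casted.metadata().map(|m| m.len()), Some(1));
    Ok(())
}
```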
+ pub fn cast_storage_to( + &self, + target_type: &DataType, + ) -> Result { + let new_value = self.value().cast_to(target_type)?; + Ok(Self::new(new_value, self.metadata.clone())) + } +} + +/// create a new ScalarAndMetadata from a ScalarValue without +/// any metadata +impl From for ScalarAndMetadata { + fn from(value: ScalarValue) -> Self { + Self::new(value, None) + } +} + +/// Assert equality of data types where one or both sides may have field metadata +/// +/// This currently compares absent metadata (e.g., one side was a DataType) and +/// empty metadata (e.g., one side was a field where the field had no metadata) +/// as equal and uses byte-for-byte comparison for the keys and values of the +/// fields, even though this is potentially too strict for some cases (e.g., +/// extension types where extension metadata is represented by JSON, or cases +/// where field metadata is orthogonal to the interpretation of the data type). +/// +/// Returns a planning error with suitably formatted type representations if +/// actual and expected do not compare to equal. +pub fn check_metadata_with_storage_equal( + actual: ( + &DataType, + Option<&std::collections::HashMap>, + ), + expected: ( + &DataType, + Option<&std::collections::HashMap>, + ), + what: &str, + context: &str, +) -> Result<(), DataFusionError> { + if actual.0 != expected.0 { + return _plan_err!( + "Expected {what} of type {}, got {}{context}", + format_type_and_metadata(expected.0, expected.1), + format_type_and_metadata(actual.0, actual.1) + ); + } + + let metadata_equal = match (actual.1, expected.1) { + (None, None) => true, + (None, Some(expected_metadata)) => expected_metadata.is_empty(), + (Some(actual_metadata), None) => actual_metadata.is_empty(), + (Some(actual_metadata), Some(expected_metadata)) => { + actual_metadata == expected_metadata + } + }; + + if !metadata_equal { + return _plan_err!( + "Expected {what} of type {}, got {}{context}", + format_type_and_metadata(expected.0, expected.1), + format_type_and_metadata(actual.0, actual.1) + ); + } + + Ok(()) +} + +/// Given a data type represented by storage and optional metadata, generate +/// a user-facing string +/// +/// This function exists to reduce the number of Field debug strings that are +/// used to communicate type information in error messages and plan explain +/// renderings. +pub fn format_type_and_metadata( + data_type: &DataType, + metadata: Option<&std::collections::HashMap>, +) -> String { + match metadata { + Some(metadata) if !metadata.is_empty() => { + format!("{data_type}<{metadata:?}>") + } + _ => data_type.to_string(), + } +} + +/// Literal metadata +/// +/// Stores metadata associated with a literal expressions +/// and is designed to be fast to `clone`. +/// +/// This structure is used to store metadata associated with a literal expression, and it +/// corresponds to the `metadata` field on [`Field`]. 
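A small sketch of the two helpers above; the extension-style metadata key is illustrative only:

```rust
use std::collections::HashMap;
use arrow::datatypes::DataType;
use datafusion_common::metadata::{
    check_metadata_with_storage_equal, format_type_and_metadata,
};

fn main() {
    let ext = HashMap::from([("ARROW:extension:name".to_string(), "json".to_string())]);

    // Renders the storage type plus metadata without falling back to a
    // Field debug string.
    let rendered = format_type_and_metadata(&DataType::Utf8, Some(&ext));
    assert!(rendered.starts_with("Utf8<"));

    // Same storage type but mismatched (non-empty vs. absent) metadata
    // produces a planning error.
    let result = check_metadata_with_storage_equal(
        (&DataType::Utf8, None),
        (&DataType::Utf8, Some(&ext)),
        "parameter",
        " at index 0",
    );
    assert!(result.is_err());
}
```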
+/// +/// # Example: Create [`FieldMetadata`] from a [`Field`] +/// ``` +/// # use std::collections::HashMap; +/// # use datafusion_common::metadata::FieldMetadata; +/// # use arrow::datatypes::{Field, DataType}; +/// # let field = Field::new("c1", DataType::Int32, true) +/// # .with_metadata(HashMap::from([("foo".to_string(), "bar".to_string())])); +/// // Create a new `FieldMetadata` instance from a `Field` +/// let metadata = FieldMetadata::new_from_field(&field); +/// // There is also a `From` impl: +/// let metadata = FieldMetadata::from(&field); +/// ``` +/// +/// # Example: Update a [`Field`] with [`FieldMetadata`] +/// ``` +/// # use datafusion_common::metadata::FieldMetadata; +/// # use arrow::datatypes::{Field, DataType}; +/// # let field = Field::new("c1", DataType::Int32, true); +/// # let metadata = FieldMetadata::new_from_field(&field); +/// // Add any metadata from `FieldMetadata` to `Field` +/// let updated_field = metadata.add_to_field(field); +/// ``` +#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] +pub struct FieldMetadata { + /// The inner metadata of a literal expression, which is a map of string + /// keys to string values. + /// + /// Note this is not a `HashMap` because `HashMap` does not provide + /// implementations for traits like `Debug` and `Hash`. + inner: Arc>, +} + +impl Default for FieldMetadata { + fn default() -> Self { + Self::new_empty() + } +} + +impl FieldMetadata { + /// Create a new empty metadata instance. + pub fn new_empty() -> Self { + Self { + inner: Arc::new(BTreeMap::new()), + } + } + + /// Merges two optional `FieldMetadata` instances, overwriting any existing + /// keys in `m` with keys from `n` if present. + /// + /// This function is commonly used in alias operations, particularly for literals + /// with metadata. When creating an alias expression, the metadata from the original + /// expression (such as a literal) is combined with any metadata specified on the alias. 
+ /// + /// # Arguments + /// + /// * `m` - The first metadata (typically from the original expression like a literal) + /// * `n` - The second metadata (typically from the alias definition) + /// + /// # Merge Strategy + /// + /// - If both metadata instances exist, they are merged with `n` taking precedence + /// - Keys from `n` will overwrite keys from `m` if they have the same name + /// - If only one metadata instance exists, it is returned unchanged + /// - If neither exists, `None` is returned + /// + /// # Example usage + /// ```rust + /// use datafusion_common::metadata::FieldMetadata; + /// use std::collections::BTreeMap; + /// + /// // Create metadata for a literal expression + /// let literal_metadata = Some(FieldMetadata::from(BTreeMap::from([ + /// ("source".to_string(), "constant".to_string()), + /// ("type".to_string(), "int".to_string()), + /// ]))); + /// + /// // Create metadata for an alias + /// let alias_metadata = Some(FieldMetadata::from(BTreeMap::from([ + /// ("description".to_string(), "answer".to_string()), + /// ("source".to_string(), "user".to_string()), // This will override literal's "source" + /// ]))); + /// + /// // Merge the metadata + /// let merged = FieldMetadata::merge_options( + /// literal_metadata.as_ref(), + /// alias_metadata.as_ref(), + /// ); + /// + /// // Result contains: {"source": "user", "type": "int", "description": "answer"} + /// assert!(merged.is_some()); + /// ``` + pub fn merge_options( + m: Option<&FieldMetadata>, + n: Option<&FieldMetadata>, + ) -> Option { + match (m, n) { + (Some(m), Some(n)) => { + let mut merged = m.clone(); + merged.extend(n.clone()); + Some(merged) + } + (Some(m), None) => Some(m.clone()), + (None, Some(n)) => Some(n.clone()), + (None, None) => None, + } + } + + /// Create a new metadata instance from a `Field`'s metadata. + pub fn new_from_field(field: &Field) -> Self { + let inner = field + .metadata() + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self { + inner: Arc::new(inner), + } + } + + /// Create a new metadata instance from a map of string keys to string values. + pub fn new(inner: BTreeMap) -> Self { + Self { + inner: Arc::new(inner), + } + } + + /// Get the inner metadata as a reference to a `BTreeMap`. + pub fn inner(&self) -> &BTreeMap { + &self.inner + } + + /// Return the inner metadata + pub fn into_inner(self) -> Arc> { + self.inner + } + + /// Adds metadata from `other` into `self`, overwriting any existing keys. + pub fn extend(&mut self, other: Self) { + if other.is_empty() { + return; + } + let other = Arc::unwrap_or_clone(other.into_inner()); + Arc::make_mut(&mut self.inner).extend(other); + } + + /// Returns true if the metadata is empty. + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Returns the number of key-value pairs in the metadata. + pub fn len(&self) -> usize { + self.inner.len() + } + + /// Convert this `FieldMetadata` into a `HashMap` + pub fn to_hashmap(&self) -> std::collections::HashMap { + self.inner + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() + } + + /// Updates the metadata on the Field with this metadata, if it is not empty. 
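`extend` is the in-place counterpart of `merge_options`: keys from the argument overwrite keys already present. A short sketch:

```rust
use std::collections::BTreeMap;
use datafusion_common::metadata::FieldMetadata;

fn main() {
    let mut base = FieldMetadata::from(BTreeMap::from([
        ("source".to_string(), "constant".to_string()),
        ("type".to_string(), "int".to_string()),
    ]));
    let overrides = FieldMetadata::from(BTreeMap::from([(
        "source".to_string(),
        "user".to_string(),
    )]));

    // Keys from `overrides` replace keys already present in `base`.
    base.extend(overrides);
    assert_eq!(base.len(), 2);
    assert_eq!(base.inner().get("source"), Some(&"user".to_string()));
}
```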
+ pub fn add_to_field(&self, field: Field) -> Field { + if self.inner.is_empty() { + return field; + } + + field.with_metadata(self.to_hashmap()) + } +} + +impl From<&Field> for FieldMetadata { + fn from(field: &Field) -> Self { + Self::new_from_field(field) + } +} + +impl From> for FieldMetadata { + fn from(inner: BTreeMap) -> Self { + Self::new(inner) + } +} + +impl From> for FieldMetadata { + fn from(map: std::collections::HashMap) -> Self { + Self::new(map.into_iter().collect()) + } +} + +/// From reference +impl From<&std::collections::HashMap> for FieldMetadata { + fn from(map: &std::collections::HashMap) -> Self { + let inner = map + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self::new(inner) + } +} + +/// From hashbrown map +impl From> for FieldMetadata { + fn from(map: HashMap) -> Self { + let inner = map.into_iter().collect(); + Self::new(inner) + } +} + +impl From<&HashMap> for FieldMetadata { + fn from(map: &HashMap) -> Self { + let inner = map + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self::new(inner) + } +} diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 38060e370bfa..d43816f75b0e 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -110,16 +110,19 @@ fn cast_struct_column( /// temporal values are formatted when cast to strings. /// /// ``` -/// use std::sync::Arc; -/// use arrow::array::{Int64Array, ArrayRef}; +/// use arrow::array::{ArrayRef, Int64Array}; /// use arrow::compute::CastOptions; /// use arrow::datatypes::{DataType, Field}; /// use datafusion_common::nested_struct::cast_column; +/// use std::sync::Arc; /// /// let source: ArrayRef = Arc::new(Int64Array::from(vec![1, i64::MAX])); /// let target = Field::new("ints", DataType::Int32, true); /// // Permit lossy conversions by producing NULL on overflow instead of erroring -/// let options = CastOptions { safe: true, ..Default::default() }; +/// let options = CastOptions { +/// safe: true, +/// ..Default::default() +/// }; /// let result = cast_column(&source, &target, &options).unwrap(); /// assert!(result.is_null(1)); /// ``` diff --git a/datafusion/common/src/param_value.rs b/datafusion/common/src/param_value.rs index 7582cff56f87..ebf68e4dd210 100644 --- a/datafusion/common/src/param_value.rs +++ b/datafusion/common/src/param_value.rs @@ -16,22 +16,37 @@ // under the License. use crate::error::{_plan_datafusion_err, _plan_err}; +use crate::metadata::{check_metadata_with_storage_equal, ScalarAndMetadata}; use crate::{Result, ScalarValue}; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Field, FieldRef}; use std::collections::HashMap; /// The parameter value corresponding to the placeholder #[derive(Debug, Clone)] pub enum ParamValues { /// For positional query parameters, like `SELECT * FROM test WHERE a > $1 AND b = $2` - List(Vec), + List(Vec), /// For named query parameters, like `SELECT * FROM test WHERE a > $foo AND b = $goo` - Map(HashMap), + Map(HashMap), } impl ParamValues { - /// Verify parameter list length and type + /// Verify parameter list length and DataType + /// + /// Use [`ParamValues::verify_fields`] to ensure field metadata is considered when + /// computing type equality. 
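With parameters now carried as `ScalarAndMetadata`, verification goes through `FieldRef`s rather than bare `DataType`s; a minimal sketch of `verify_fields` with metadata-free positional parameters:

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, FieldRef};
use datafusion_common::{ParamValues, ScalarValue};

fn main() -> datafusion_common::Result<()> {
    // Positional parameters; the Vec<ScalarValue> From impl wraps each value
    // in ScalarAndMetadata with no metadata attached.
    let params = ParamValues::from(vec![
        ScalarValue::Int64(Some(1)),
        ScalarValue::Utf8(Some("foo".to_string())),
    ]);

    // Expected placeholder types, expressed as nameless nullable fields.
    let expected: Vec<FieldRef> = vec![
        Arc::new(Field::new("", DataType::Int64, true)),
        Arc::new(Field::new("", DataType::Utf8, true)),
    ];
    params.verify_fields(&expected)?;
    Ok(())
}
```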
+ #[deprecated(since = "51.0.0", note = "Use verify_fields instead")] pub fn verify(&self, expect: &[DataType]) -> Result<()> { + // make dummy Fields + let expect = expect + .iter() + .map(|dt| Field::new("", dt.clone(), true).into()) + .collect::>(); + self.verify_fields(&expect) + } + + /// Verify parameter list length and type + pub fn verify_fields(&self, expect: &[FieldRef]) -> Result<()> { match self { ParamValues::List(list) => { // Verify if the number of params matches the number of values @@ -45,15 +60,16 @@ impl ParamValues { // Verify if the types of the params matches the types of the values let iter = expect.iter().zip(list.iter()); - for (i, (param_type, value)) in iter.enumerate() { - if *param_type != value.data_type() { - return _plan_err!( - "Expected parameter of type {}, got {:?} at index {}", - param_type, - value.data_type(), - i - ); - } + for (i, (param_type, lit)) in iter.enumerate() { + check_metadata_with_storage_equal( + ( + &lit.value.data_type(), + lit.metadata.as_ref().map(|m| m.to_hashmap()).as_ref(), + ), + (param_type.data_type(), Some(param_type.metadata())), + "parameter", + &format!(" at index {i}"), + )?; } Ok(()) } @@ -65,7 +81,7 @@ impl ParamValues { } } - pub fn get_placeholders_with_values(&self, id: &str) -> Result { + pub fn get_placeholders_with_values(&self, id: &str) -> Result { match self { ParamValues::List(list) => { if id.is_empty() { @@ -99,7 +115,7 @@ impl ParamValues { impl From> for ParamValues { fn from(value: Vec) -> Self { - Self::List(value) + Self::List(value.into_iter().map(ScalarAndMetadata::from).collect()) } } @@ -108,8 +124,10 @@ where K: Into, { fn from(value: Vec<(K, ScalarValue)>) -> Self { - let value: HashMap = - value.into_iter().map(|(k, v)| (k.into(), v)).collect(); + let value: HashMap = value + .into_iter() + .map(|(k, v)| (k.into(), ScalarAndMetadata::from(v))) + .collect(); Self::Map(value) } } @@ -119,8 +137,10 @@ where K: Into, { fn from(value: HashMap) -> Self { - let value: HashMap = - value.into_iter().map(|(k, v)| (k.into(), v)).collect(); + let value: HashMap = value + .into_iter() + .map(|(k, v)| (k.into(), ScalarAndMetadata::from(v))) + .collect(); Self::Map(value) } } diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs index ff413e08ab07..3b7d80b3da78 100644 --- a/datafusion/common/src/pyarrow.rs +++ b/datafusion/common/src/pyarrow.rs @@ -22,7 +22,7 @@ use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use pyo3::exceptions::PyException; use pyo3::prelude::PyErr; use pyo3::types::{PyAnyMethods, PyList}; -use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyObject, PyResult, Python}; +use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyResult, Python}; use crate::{DataFusionError, ScalarValue}; @@ -52,11 +52,11 @@ impl FromPyArrow for ScalarValue { } impl ToPyArrow for ScalarValue { - fn to_pyarrow(&self, py: Python) -> PyResult { + fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult> { let array = self.to_array()?; // convert to pyarrow array using C data interface let pyarray = array.to_data().to_pyarrow(py)?; - let pyscalar = pyarray.call_method1(py, "__getitem__", (0,))?; + let pyscalar = pyarray.call_method1("__getitem__", (0,))?; Ok(pyscalar) } @@ -79,23 +79,22 @@ impl<'source> IntoPyObject<'source> for ScalarValue { let array = self.to_array()?; // convert to pyarrow array using C data interface let pyarray = array.to_data().to_pyarrow(py)?; - let pyarray_bound = pyarray.bind(py); - pyarray_bound.call_method1("__getitem__", (0,)) + 
pyarray.call_method1("__getitem__", (0,)) } } #[cfg(test)] mod tests { use pyo3::ffi::c_str; - use pyo3::prepare_freethreaded_python; use pyo3::py_run; use pyo3::types::PyDict; + use pyo3::Python; use super::*; fn init_python() { - prepare_freethreaded_python(); - Python::with_gil(|py| { + Python::initialize(); + Python::attach(|py| { if py.run(c_str!("import pyarrow"), None, None).is_err() { let locals = PyDict::new(py); py.run( @@ -135,12 +134,11 @@ mod tests { ScalarValue::Date32(Some(1234)), ]; - Python::with_gil(|py| { + Python::attach(|py| { for scalar in example_scalars.iter() { - let result = ScalarValue::from_pyarrow_bound( - scalar.to_pyarrow(py).unwrap().bind(py), - ) - .unwrap(); + let result = + ScalarValue::from_pyarrow_bound(&scalar.to_pyarrow(py).unwrap()) + .unwrap(); assert_eq!(scalar, &result); } }); @@ -150,7 +148,7 @@ mod tests { fn test_py_scalar() -> PyResult<()> { init_python(); - Python::with_gil(|py| -> PyResult<()> { + Python::attach(|py| -> PyResult<()> { let scalar_float = ScalarValue::Float64(Some(12.34)); let py_float = scalar_float .into_pyobject(py)? diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index a70a027a8fac..f2546040ffd7 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -171,9 +171,9 @@ pub use struct_builder::ScalarStructBuilder; /// let field_b = Field::new("b", DataType::Utf8, false); /// /// let s1 = ScalarStructBuilder::new() -/// .with_scalar(field_a, ScalarValue::from(1i32)) -/// .with_scalar(field_b, ScalarValue::from("foo")) -/// .build(); +/// .with_scalar(field_a, ScalarValue::from(1i32)) +/// .with_scalar(field_b, ScalarValue::from("foo")) +/// .build(); /// ``` /// /// ## Example: Creating a null [`ScalarValue::Struct`] using [`ScalarStructBuilder`] @@ -199,13 +199,13 @@ pub use struct_builder::ScalarStructBuilder; /// // Build a struct like: {a: 1, b: "foo"} /// // Field description /// let fields = Fields::from(vec![ -/// Field::new("a", DataType::Int32, false), -/// Field::new("b", DataType::Utf8, false), +/// Field::new("a", DataType::Int32, false), +/// Field::new("b", DataType::Utf8, false), /// ]); /// // one row arrays for each field /// let arrays: Vec = vec![ -/// Arc::new(Int32Array::from(vec![1])), -/// Arc::new(StringArray::from(vec!["foo"])), +/// Arc::new(Int32Array::from(vec![1])), +/// Arc::new(StringArray::from(vec!["foo"])), /// ]; /// // no nulls for this array /// let nulls = None; @@ -1068,8 +1068,8 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::datatypes::DataType; + /// use datafusion_common::ScalarValue; /// /// let scalar = ScalarValue::try_new_null(&DataType::Int32).unwrap(); /// assert_eq!(scalar.is_null(), true); @@ -2231,23 +2231,16 @@ impl ScalarValue { /// /// # Example /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::{BooleanArray, Int32Array}; + /// use datafusion_common::ScalarValue; /// /// let arr = Int32Array::from(vec![Some(1), None, Some(10)]); /// let five = ScalarValue::Int32(Some(5)); /// - /// let result = arrow::compute::kernels::cmp::lt( - /// &arr, - /// &five.to_scalar().unwrap(), - /// ).unwrap(); + /// let result = + /// arrow::compute::kernels::cmp::lt(&arr, &five.to_scalar().unwrap()).unwrap(); /// - /// let expected = BooleanArray::from(vec![ - /// Some(true), - /// None, - /// Some(false) - /// ] - /// ); + /// let expected = BooleanArray::from(vec![Some(true), None, Some(false)]); /// /// assert_eq!(&result, &expected); 
/// ``` @@ -2265,26 +2258,20 @@ impl ScalarValue { /// /// # Example /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::{ArrayRef, BooleanArray}; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Boolean(Some(true)), - /// ScalarValue::Boolean(None), - /// ScalarValue::Boolean(Some(false)), + /// ScalarValue::Boolean(Some(true)), + /// ScalarValue::Boolean(None), + /// ScalarValue::Boolean(Some(false)), /// ]; /// /// // Build an Array from the list of ScalarValues - /// let array = ScalarValue::iter_to_array(scalars.into_iter()) - /// .unwrap(); + /// let array = ScalarValue::iter_to_array(scalars.into_iter()).unwrap(); /// - /// let expected: ArrayRef = std::sync::Arc::new( - /// BooleanArray::from(vec![ - /// Some(true), - /// None, - /// Some(false) - /// ] - /// )); + /// let expected: ArrayRef = + /// std::sync::Arc::new(BooleanArray::from(vec![Some(true), None, Some(false)])); /// /// assert_eq!(&array, &expected); /// ``` @@ -2731,23 +2718,24 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{ListArray, Int32Array}; + /// use arrow::array::{Int32Array, ListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// ScalarValue::Int32(Some(2)), /// ]; /// /// let result = ScalarValue::new_list(&scalars, &DataType::Int32, true); /// - /// let expected = ListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -2791,23 +2779,25 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{ListArray, Int32Array}; + /// use arrow::array::{Int32Array, ListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// ScalarValue::Int32(Some(2)), /// ]; /// - /// let result = ScalarValue::new_list_from_iter(scalars.into_iter(), &DataType::Int32, true); + /// let result = + /// ScalarValue::new_list_from_iter(scalars.into_iter(), &DataType::Int32, true); /// - /// let expected = ListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -2833,23 +2823,25 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{LargeListArray, Int32Array}; + /// use arrow::array::{Int32Array, LargeListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_large_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// 
ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// ScalarValue::Int32(Some(2)), /// ]; /// /// let result = ScalarValue::new_large_list(&scalars, &DataType::Int32); /// - /// let expected = LargeListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = + /// LargeListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -3248,14 +3240,14 @@ impl ScalarValue { /// /// Example 1: Array (ScalarValue::Int32) /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; + /// use datafusion_common::ScalarValue; /// /// // Equivalent to [[1,2,3], [4,5]] /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Convert the array into Scalar Values for each row @@ -3278,15 +3270,15 @@ impl ScalarValue { /// /// Example 2: Nested array (ScalarValue::List) /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::utils::SingleRowListArrayBuilder; + /// use datafusion_common::ScalarValue; /// use std::sync::Arc; /// /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Wrap into another layer of list, we got nested array as [ [[1,2,3], [4,5]] ] @@ -3295,33 +3287,34 @@ impl ScalarValue { /// // Convert the array into Scalar Values for each row, we got 1D arrays in this example /// let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap(); /// - /// let l1 = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// ]); - /// let l2 = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(4), Some(5)]), - /// ]); + /// let l1 = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// Some(2), + /// Some(3), + /// ])]); + /// let l2 = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(4), + /// Some(5), + /// ])]); /// - /// let expected = vec![ - /// Some(vec![ + /// let expected = vec![Some(vec![ /// ScalarValue::List(Arc::new(l1)), /// ScalarValue::List(Arc::new(l2)), - /// ]), - /// ]; + /// ])]; /// /// assert_eq!(scalar_vec, expected); /// ``` /// /// Example 3: Nullable array /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; + /// use datafusion_common::ScalarValue; /// /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// None, - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// None, + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Convert the array into Scalar Values for each row diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index fd19dccf8963..56daee904514 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -47,13 +47,11 @@ impl ScalarStructBuilder { /// ```rust /// # use arrow::datatypes::{DataType, Field}; /// # use 
datafusion_common::scalar::ScalarStructBuilder; - /// let fields = vec![ - /// Field::new("a", DataType::Int32, false), - /// ]; + /// let fields = vec![Field::new("a", DataType::Int32, false)]; /// let sv = ScalarStructBuilder::new_null(fields); /// // Note this is `NULL`, not `{a: NULL}` /// assert_eq!(format!("{sv}"), "NULL"); - ///``` + /// ``` /// /// To create a struct where the *fields* are null, use `Self::new()` and /// pass null values for each field: @@ -65,9 +63,9 @@ impl ScalarStructBuilder { /// let field = Field::new("a", DataType::Int32, true); /// // add a null value for the "a" field /// let sv = ScalarStructBuilder::new() - /// .with_scalar(field, ScalarValue::Int32(None)) - /// .build() - /// .unwrap(); + /// .with_scalar(field, ScalarValue::Int32(None)) + /// .build() + /// .unwrap(); /// // value is not null, but field is /// assert_eq!(format!("{sv}"), "{a:}"); /// ``` diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 2481a88676ef..da298c20ebcb 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -520,33 +520,35 @@ impl Statistics { /// # use arrow::datatypes::{Field, Schema, DataType}; /// # use datafusion_common::stats::Precision; /// let stats1 = Statistics::default() - /// .with_num_rows(Precision::Exact(1)) - /// .with_total_byte_size(Precision::Exact(2)) - /// .add_column_statistics(ColumnStatistics::new_unknown() - /// .with_null_count(Precision::Exact(3)) - /// .with_min_value(Precision::Exact(ScalarValue::from(4))) - /// .with_max_value(Precision::Exact(ScalarValue::from(5))) - /// ); + /// .with_num_rows(Precision::Exact(1)) + /// .with_total_byte_size(Precision::Exact(2)) + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// .with_null_count(Precision::Exact(3)) + /// .with_min_value(Precision::Exact(ScalarValue::from(4))) + /// .with_max_value(Precision::Exact(ScalarValue::from(5))), + /// ); /// /// let stats2 = Statistics::default() - /// .with_num_rows(Precision::Exact(10)) - /// .with_total_byte_size(Precision::Inexact(20)) - /// .add_column_statistics(ColumnStatistics::new_unknown() - /// // absent null count - /// .with_min_value(Precision::Exact(ScalarValue::from(40))) - /// .with_max_value(Precision::Exact(ScalarValue::from(50))) - /// ); + /// .with_num_rows(Precision::Exact(10)) + /// .with_total_byte_size(Precision::Inexact(20)) + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// // absent null count + /// .with_min_value(Precision::Exact(ScalarValue::from(40))) + /// .with_max_value(Precision::Exact(ScalarValue::from(50))), + /// ); /// /// let merged_stats = stats1.try_merge(&stats2).unwrap(); /// let expected_stats = Statistics::default() - /// .with_num_rows(Precision::Exact(11)) - /// .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact - /// .add_column_statistics( - /// ColumnStatistics::new_unknown() - /// .with_null_count(Precision::Absent) // missing from stats2 --> absent - /// .with_min_value(Precision::Exact(ScalarValue::from(4))) - /// .with_max_value(Precision::Exact(ScalarValue::from(50))) - /// ); + /// .with_num_rows(Precision::Exact(11)) + /// .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// .with_null_count(Precision::Absent) // missing from stats2 --> absent + /// .with_min_value(Precision::Exact(ScalarValue::from(4))) + /// .with_max_value(Precision::Exact(ScalarValue::from(50))), + /// ); 
/// /// assert_eq!(merged_stats, expected_stats) /// ``` diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 574465856760..3163a8b16c8d 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -69,8 +69,11 @@ impl std::fmt::Display for ResolvedTableReference { /// /// // Get a table reference to 'myschema.mytable' (note the capitalization) /// let table_reference = TableReference::from("MySchema.MyTable"); -/// assert_eq!(table_reference, TableReference::partial("myschema", "mytable")); -///``` +/// assert_eq!( +/// table_reference, +/// TableReference::partial("myschema", "mytable") +/// ); +/// ``` #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum TableReference { /// An unqualified table reference, e.g. "table" @@ -247,7 +250,10 @@ impl TableReference { /// assert_eq!(table_reference.to_quoted_string(), "myschema.mytable"); /// /// let table_reference = TableReference::partial("MySchema", "MyTable"); - /// assert_eq!(table_reference.to_quoted_string(), r#""MySchema"."MyTable""#); + /// assert_eq!( + /// table_reference.to_quoted_string(), + /// r#""MySchema"."MyTable""# + /// ); /// ``` pub fn to_quoted_string(&self) -> String { match self { diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index d97d4003e729..c51dea1c4de0 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -55,7 +55,7 @@ pub fn format_batches(results: &[RecordBatch]) -> Result i64 { 2 } /// let expr = orig_expr(); /// let ret = Transformed::no(expr.clone()) -/// .transform_data(|expr| { -/// // closure returns a result and potentially transforms the node -/// // in this example, it does transform the node -/// let new_expr = make_new_expr(expr); -/// Ok(Transformed::yes(new_expr)) -/// }).unwrap(); +/// .transform_data(|expr| { +/// // closure returns a result and potentially transforms the node +/// // in this example, it does transform the node +/// let new_expr = make_new_expr(expr); +/// Ok(Transformed::yes(new_expr)) +/// }) +/// .unwrap(); /// // transformed flag is the union of the original ans closure's transformed flag /// assert!(ret.transformed); /// ``` diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index eb7cf88e0075..674b1a41204d 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -67,12 +67,12 @@ pub type LogicalTypeRef = Arc; /// &NativeType::String /// } /// -/// fn signature(&self) -> TypeSignature<'_> { -/// TypeSignature::Extension { -/// name: "JSON", -/// parameters: &[], -/// } -/// } +/// fn signature(&self) -> TypeSignature<'_> { +/// TypeSignature::Extension { +/// name: "JSON", +/// parameters: &[], +/// } +/// } /// } /// ``` pub trait LogicalType: Sync + Send { diff --git a/datafusion/common/src/utils/memory.rs b/datafusion/common/src/utils/memory.rs index 29e523996cf4..a56b940fab66 100644 --- a/datafusion/common/src/utils/memory.rs +++ b/datafusion/common/src/utils/memory.rs @@ -56,8 +56,8 @@ use std::mem::size_of; /// impl MyStruct { /// fn size(&self) -> Result { /// let num_elements = self.values.len(); -/// let fixed_size = std::mem::size_of_val(self) + -/// std::mem::size_of_val(&self.values); +/// let fixed_size = +/// std::mem::size_of_val(self) + std::mem::size_of_val(&self.values); /// /// estimate_memory_size::(num_elements, fixed_size) /// } @@ -73,8 +73,8 @@ use std::mem::size_of; /// let 
num_rows = 100; /// let fixed_size = std::mem::size_of::>(); /// let estimated_hashtable_size = -/// estimate_memory_size::<(u64, u64)>(num_rows,fixed_size) -/// .expect("Size estimation failed"); +/// estimate_memory_size::<(u64, u64)>(num_rows, fixed_size) +/// .expect("Size estimation failed"); /// ``` pub fn estimate_memory_size(num_elements: usize, fixed_size: usize) -> Result { // For the majority of cases hashbrown overestimates the bucket quantity diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 045c02a5a2aa..7b145ac3ae21 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -46,26 +46,23 @@ use std::thread::available_parallelism; /// /// Example: /// ``` -/// use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; +/// use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; /// use datafusion_common::project_schema; /// /// // Schema with columns 'a', 'b', and 'c' /// let schema = SchemaRef::new(Schema::new(vec![ -/// Field::new("a", DataType::Int32, true), -/// Field::new("b", DataType::Int64, true), -/// Field::new("c", DataType::Utf8, true), +/// Field::new("a", DataType::Int32, true), +/// Field::new("b", DataType::Int64, true), +/// Field::new("c", DataType::Utf8, true), /// ])); /// /// // Pick columns 'c' and 'b' -/// let projection = Some(vec![2,1]); -/// let projected_schema = project_schema( -/// &schema, -/// projection.as_ref() -/// ).unwrap(); +/// let projection = Some(vec![2, 1]); +/// let projected_schema = project_schema(&schema, projection.as_ref()).unwrap(); /// /// let expected_schema = SchemaRef::new(Schema::new(vec![ -/// Field::new("c", DataType::Utf8, true), -/// Field::new("b", DataType::Int64, true), +/// Field::new("c", DataType::Utf8, true), +/// Field::new("b", DataType::Int64, true), /// ])); /// /// assert_eq!(projected_schema, expected_schema); @@ -398,9 +395,11 @@ pub fn longest_consecutive_prefix>( /// # use arrow::array::types::Int64Type; /// # use datafusion_common::utils::SingleRowListArrayBuilder; /// // Array is [1, 2, 3] -/// let arr = ListArray::from_iter_primitive::(vec![ -/// Some(vec![Some(1), Some(2), Some(3)]), -/// ]); +/// let arr = ListArray::from_iter_primitive::(vec![Some(vec![ +/// Some(1), +/// Some(2), +/// Some(3), +/// ])]); /// // Wrap as a list array: [[1, 2, 3]] /// let list_arr = SingleRowListArrayBuilder::new(Arc::new(arr)).build_list_array(); /// assert_eq!(list_arr.len(), 1); @@ -554,7 +553,8 @@ pub fn fixed_size_list_to_arrays(a: &ArrayRef) -> Vec { /// use datafusion_common::utils::base_type; /// use std::sync::Arc; /// -/// let data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); +/// let data_type = +/// DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); /// assert_eq!(base_type(&data_type), DataType::Int32); /// /// let data_type = DataType::Int32; @@ -906,16 +906,19 @@ pub fn get_available_parallelism() -> usize { /// # use datafusion_common::utils::take_function_args; /// # use datafusion_common::ScalarValue; /// fn my_function(args: &[ScalarValue]) -> Result<()> { -/// // function expects 2 args, so create a 2-element array -/// let [arg1, arg2] = take_function_args("my_function", args)?; -/// // ... do stuff.. -/// Ok(()) +/// // function expects 2 args, so create a 2-element array +/// let [arg1, arg2] = take_function_args("my_function", args)?; +/// // ... do stuff.. 
+/// Ok(()) /// } /// /// // Calling the function with 1 argument produces an error: /// let args = vec![ScalarValue::Int32(Some(10))]; /// let err = my_function(&args).unwrap_err(); -/// assert_eq!(err.to_string(), "Execution error: my_function function requires 2 arguments, got 1"); +/// assert_eq!( +/// err.to_string(), +/// "Execution error: my_function function requires 2 arguments, got 1" +/// ); /// // Calling the function with 2 arguments works great /// let args = vec![ScalarValue::Int32(Some(10)), ScalarValue::Int32(Some(20))]; /// my_function(&args).unwrap(); diff --git a/datafusion/common/src/utils/proxy.rs b/datafusion/common/src/utils/proxy.rs index d940677a5fb3..fb951aa3b028 100644 --- a/datafusion/common/src/utils/proxy.rs +++ b/datafusion/common/src/utils/proxy.rs @@ -47,7 +47,9 @@ pub trait VecAllocExt { /// assert_eq!(allocated, 16); // no new allocation needed /// /// // push more data into the vec - /// for _ in 0..10 { vec.push_accounted(1, &mut allocated); } + /// for _ in 0..10 { + /// vec.push_accounted(1, &mut allocated); + /// } /// assert_eq!(allocated, 64); // underlying vec has space for 10 u32s /// assert_eq!(vec.allocated_size(), 64); /// ``` @@ -82,7 +84,9 @@ pub trait VecAllocExt { /// assert_eq!(vec.allocated_size(), 16); // no new allocation needed /// /// // push more data into the vec - /// for _ in 0..10 { vec.push(1); } + /// for _ in 0..10 { + /// vec.push(1); + /// } /// assert_eq!(vec.allocated_size(), 64); // space for 64 now /// ``` fn allocated_size(&self) -> usize; @@ -133,7 +137,9 @@ pub trait RawTableAllocExt { /// assert_eq!(allocated, 64); /// /// // insert more values - /// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); } + /// for i in 0..100 { + /// table.insert_accounted(i, hash_fn, &mut allocated); + /// } /// assert_eq!(allocated, 400); /// ``` fn insert_accounted( @@ -200,7 +206,9 @@ pub trait HashTableAllocExt { /// assert_eq!(allocated, 64); /// /// // insert more values - /// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); } + /// for i in 0..100 { + /// table.insert_accounted(i, hash_fn, &mut allocated); + /// } /// assert_eq!(allocated, 400); /// ``` fn insert_accounted( diff --git a/datafusion/core/benches/parquet_query_sql.rs b/datafusion/core/benches/parquet_query_sql.rs index 14dcdf15f173..e2b381048013 100644 --- a/datafusion/core/benches/parquet_query_sql.rs +++ b/datafusion/core/benches/parquet_query_sql.rs @@ -166,11 +166,12 @@ fn generate_file() -> NamedTempFile { } let metadata = writer.close().unwrap(); + let file_metadata = metadata.file_metadata(); assert_eq!( - metadata.num_rows as usize, + file_metadata.num_rows() as usize, WRITE_RECORD_BATCH_SIZE * NUM_BATCHES ); - assert_eq!(metadata.row_groups.len(), EXPECTED_ROW_GROUPS); + assert_eq!(metadata.row_groups().len(), EXPECTED_ROW_GROUPS); println!( "Generated parquet file in {} seconds", diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 287a133273d8..3186c5cb8230 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -258,10 +258,13 @@ impl DataFrame { /// # async fn main() -> Result<()> { /// // datafusion will parse number as i64 first. 
/// let sql = "a > 1 and b in (1, 10)"; - /// let expected = col("a").gt(lit(1 as i64)) - /// .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false)); + /// let expected = col("a") + /// .gt(lit(1 as i64)) + /// .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false)); /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let expr = df.parse_sql_expr(sql)?; /// assert_eq!(expected, expr); /// # Ok(()) @@ -289,14 +292,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.select_columns(&["a", "b"])?; /// let expected = vec![ /// "+---+---+", /// "| a | b |", /// "+---+---+", /// "| 1 | 2 |", - /// "+---+---+" + /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -329,8 +334,10 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let df : DataFrame = df.select_exprs(&["a * b", "c"])?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let df: DataFrame = df.select_exprs(&["a * b", "c"])?; /// # Ok(()) /// # } /// ``` @@ -357,14 +364,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.select(vec![col("a"), col("b") * col("c")])?; /// let expected = vec![ /// "+---+-----------------------+", /// "| a | ?table?.b * ?table?.c |", /// "+---+-----------------------+", /// "| 1 | 6 |", - /// "+---+-----------------------+" + /// "+---+-----------------------+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -407,7 +416,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // +----+----+----+ /// // | a | b | c | /// // +----+----+----+ @@ -419,7 +430,7 @@ impl DataFrame { /// "| b | c |", /// "+---+---+", /// "| 2 | 3 |", - /// "+---+---+" + /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -518,7 +529,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.filter(col("a").lt_eq(col("b")))?; /// // all rows where a <= b are returned /// let expected = vec![ @@ -528,7 +541,7 @@ impl DataFrame { /// "| 1 | 2 | 3 |", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # 
assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -557,7 +570,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// /// // The following use is the equivalent of "SELECT MIN(b) GROUP BY a" /// let df1 = df.clone().aggregate(vec![col("a")], vec![min(col("b"))])?; @@ -568,7 +583,7 @@ impl DataFrame { /// "| 1 | 2 |", /// "| 4 | 5 |", /// "| 7 | 8 |", - /// "+---+----------------+" + /// "+---+----------------+", /// ]; /// assert_batches_sorted_eq!(expected1, &df1.collect().await?); /// // The following use is the equivalent of "SELECT MIN(b)" @@ -578,7 +593,7 @@ impl DataFrame { /// "| min(?table?.b) |", /// "+----------------+", /// "| 2 |", - /// "+----------------+" + /// "+----------------+", /// ]; /// # assert_batches_sorted_eq!(expected2, &df2.collect().await?); /// # Ok(()) @@ -646,7 +661,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.limit(1, Some(2))?; /// let expected = vec![ /// "+---+---+---+", @@ -654,7 +671,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -683,7 +700,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? ; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone(); /// let df = df.union(d2)?; /// let expected = vec![ @@ -692,7 +711,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 1 | 2 | 3 |", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -723,8 +742,13 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = df.clone().select_columns(&["b", "c", "a"])?.with_column("d", lit("77"))?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = df + /// .clone() + /// .select_columns(&["b", "c", "a"])? 
+ /// .with_column("d", lit("77"))?; /// let df = df.union_by_name(d2)?; /// let expected = vec![ /// "+---+---+---+----+", @@ -732,7 +756,7 @@ impl DataFrame { /// "+---+---+---+----+", /// "| 1 | 2 | 3 | |", /// "| 1 | 2 | 3 | 77 |", - /// "+---+---+---+----+" + /// "+---+---+---+----+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -762,7 +786,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone(); /// let df = df.union_distinct(d2)?; /// // df2 are duplicate of df @@ -771,7 +797,7 @@ impl DataFrame { /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -802,7 +828,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone().select_columns(&["b", "c", "a"])?; /// let df = df.union_by_name_distinct(d2)?; /// let expected = vec![ @@ -810,7 +838,7 @@ impl DataFrame { /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -837,14 +865,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.distinct()?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -871,15 +901,17 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// // Return a single row (a, b) for each distinct value of a - /// .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? 
+ /// // Return a single row (a, b) for each distinct value of a + /// .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?; /// let expected = vec![ /// "+---+---+", /// "| a | b |", /// "+---+---+", /// "| 1 | 2 |", - /// "+---+---+" + /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -1125,11 +1157,13 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.sort(vec![ - /// col("a").sort(false, true), // a DESC, nulls first - /// col("b").sort(true, false), // b ASC, nulls last - /// ])?; + /// col("a").sort(false, true), // a DESC, nulls first + /// col("b").sort(true, false), // b ASC, nulls last + /// ])?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", @@ -1176,12 +1210,17 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let left = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let right = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .select(vec![ - /// col("a").alias("a2"), - /// col("b").alias("b2"), - /// col("c").alias("c2")])?; + /// let left = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let right = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .select(vec![ + /// col("a").alias("a2"), + /// col("b").alias("b2"), + /// col("c").alias("c2"), + /// ])?; /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)` /// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`. 
/// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?; @@ -1190,13 +1229,12 @@ impl DataFrame { /// "| a | b | c | a2 | b2 | c2 |", /// "+---+---+---+----+----+----+", /// "| 1 | 2 | 3 | 1 | 2 | 3 |", - /// "+---+---+---+----+----+----+" + /// "+---+---+---+----+----+----+", /// ]; /// assert_batches_sorted_eq!(expected, &join.collect().await?); /// # Ok(()) /// # } /// ``` - /// pub fn join( self, right: DataFrame, @@ -1258,7 +1296,7 @@ impl DataFrame { /// "+---+---+---+----+----+----+", /// "| a | b | c | a2 | b2 | c2 |", /// "+---+---+---+----+----+----+", - /// "+---+---+---+----+----+----+" + /// "+---+---+---+----+----+----+", /// ]; /// # assert_batches_sorted_eq!(expected, &join_on.collect().await?); /// # Ok(()) @@ -1290,7 +1328,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?; /// let expected = vec![ /// "+---+---+---+", @@ -1299,7 +1339,7 @@ impl DataFrame { /// "| 1 | 2 | 3 |", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df1.collect().await?); /// # Ok(()) @@ -1328,7 +1368,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let count = df.count().await?; // 1 /// # assert_eq!(count, 1); /// # Ok(()) @@ -1367,7 +1409,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.collect().await?; /// # Ok(()) /// # } @@ -1387,7 +1431,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// df.show().await?; /// # Ok(()) /// # } @@ -1446,7 +1492,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// df.show_limit(10).await?; /// # Ok(()) /// # } @@ -1472,7 +1520,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let stream = df.execute_stream().await?; /// # Ok(()) /// # } @@ -1498,7 +1548,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + 
/// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.collect_partitioned().await?; /// # Ok(()) /// # } @@ -1518,7 +1570,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.execute_stream_partitioned().await?; /// # Ok(()) /// # } @@ -1547,7 +1601,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let schema = df.schema(); /// # Ok(()) /// # } @@ -1613,8 +1669,14 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let batches = df.limit(0, Some(100))?.explain(false, false)?.collect().await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let batches = df + /// .limit(0, Some(100))? + /// .explain(false, false)? + /// .collect() + /// .await?; /// # Ok(()) /// # } /// ``` @@ -1637,8 +1699,18 @@ impl DataFrame { /// # async fn main() -> Result<()> { /// use datafusion_expr::{Explain, ExplainOption}; /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let batches = df.limit(0, Some(100))?.explain_with_options(ExplainOption::default().with_verbose(false).with_analyze(false))?.collect().await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let batches = df + /// .limit(0, Some(100))? + /// .explain_with_options( + /// ExplainOption::default() + /// .with_verbose(false) + /// .with_analyze(false), + /// )? 
+ /// .collect() + /// .await?; /// # Ok(()) /// # } /// ``` @@ -1668,7 +1740,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let f = df.registry(); /// // use f.udf("name", vec![...]) to use the udf /// # Ok(()) @@ -1687,15 +1761,19 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.intersect(d2)?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -1721,15 +1799,19 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.intersect_distinct(d2)?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -1755,8 +1837,12 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let result = df.except(d2)?; /// // those columns are not in example.csv, but in example_long.csv /// let expected = vec![ @@ -1765,7 +1851,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &result.collect().await?); /// # Ok(()) @@ -1791,8 +1877,12 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let result = df.except_distinct(d2)?; /// // those columns are not in example.csv, but in example_long.csv 
/// let expected = vec![ @@ -1801,7 +1891,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &result.collect().await?); /// # Ok(()) @@ -1878,13 +1968,15 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_csv( - /// "output.csv", - /// DataFrameWriteOptions::new(), - /// None, // can also specify CSV writing options here - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_csv( + /// "output.csv", + /// DataFrameWriteOptions::new(), + /// None, // can also specify CSV writing options here + /// ) + /// .await?; /// # fs::remove_file("output.csv")?; /// # Ok(()) /// # } @@ -1948,13 +2040,11 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_json( - /// "output.json", - /// DataFrameWriteOptions::new(), - /// None - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_json("output.json", DataFrameWriteOptions::new(), None) + /// .await?; /// # fs::remove_file("output.json")?; /// # Ok(()) /// # } @@ -2015,7 +2105,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.with_column("ab_sum", col("a") + col("b"))?; /// # Ok(()) /// # } @@ -2089,7 +2181,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.with_column_renamed("ab_sum", "total")?; /// /// # Ok(()) @@ -2222,7 +2316,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.cache().await?; /// # Ok(()) /// # } @@ -2266,7 +2362,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // Fill nulls in only columns "a" and "c": /// let df = df.fill_null(ScalarValue::from(0), vec!["a".to_owned(), "c".to_owned()])?; /// // Fill nulls across all columns: @@ -2337,9 
+2435,9 @@ impl DataFrame { /// Helper for creating DataFrame. /// # Example /// ``` - /// use std::sync::Arc; /// use arrow::array::{ArrayRef, Int32Array, StringArray}; /// use datafusion::prelude::DataFrame; + /// use std::sync::Arc; /// let id: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); /// let name: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz"])); /// let df = DataFrame::from_columns(vec![("id", id), ("name", name)]).unwrap(); diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index d46a902ca513..cb8a6cf29541 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -42,13 +42,15 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_parquet( - /// "output.parquet", - /// DataFrameWriteOptions::new(), - /// None, // can also specify parquet writing options here - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_parquet( + /// "output.parquet", + /// DataFrameWriteOptions::new(), + /// None, // can also specify parquet writing options here + /// ) + /// .await?; /// # fs::remove_file("output.parquet")?; /// # Ok(()) /// # } @@ -116,6 +118,8 @@ mod tests { use datafusion_execution::config::SessionConfig; use datafusion_expr::{col, lit}; + #[cfg(feature = "parquet_encryption")] + use datafusion_common::config::ConfigFileEncryptionProperties; use object_store::local::LocalFileSystem; use parquet::file::reader::FileReader; use tempfile::TempDir; @@ -280,7 +284,8 @@ mod tests { // Write encrypted parquet using write_parquet let mut options = TableParquetOptions::default(); - options.crypto.file_encryption = Some((&encrypt).into()); + options.crypto.file_encryption = + Some(ConfigFileEncryptionProperties::from(&encrypt)); options.global.allow_single_file_parallelism = allow_single_file_parallelism; df.write_parquet( diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index e165707c2eb0..4881783eeba6 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -90,7 +90,7 @@ pub(crate) mod test_util { ) .with_file_groups(file_groups) .with_statistics(statistics) - .with_projection(projection) + .with_projection_indices(projection) .with_limit(limit) .build(), ) diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index 8c1bb02ef073..e78c5f09553c 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -269,6 +269,8 @@ pub struct ParquetReadOptions<'a> { pub file_sort_order: Vec>, /// Properties for decryption of Parquet files that use modular encryption pub file_decryption_properties: Option, + /// Metadata size hint for Parquet files reading (in bytes) + pub metadata_size_hint: Option, } impl Default for ParquetReadOptions<'_> { @@ -281,6 +283,7 @@ impl Default for ParquetReadOptions<'_> { schema: None, file_sort_order: vec![], file_decryption_properties: None, + metadata_size_hint: None, } } } @@ 
-340,6 +343,12 @@ impl<'a> ParquetReadOptions<'a> { self.file_decryption_properties = Some(file_decryption_properties); self } + + /// Configure metadata size hint for Parquet files reading (in bytes) + pub fn metadata_size_hint(mut self, size_hint: Option) -> Self { + self.metadata_size_hint = size_hint; + self + } } /// Options that control the reading of ARROW files. @@ -606,6 +615,11 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> { if let Some(file_decryption_properties) = &self.file_decryption_properties { options.crypto.file_decryption = Some(file_decryption_properties.clone()); } + // This can be overridden per-read in ParquetReadOptions, if setting. + if let Some(metadata_size_hint) = self.metadata_size_hint { + options.global.metadata_size_hint = Some(metadata_size_hint); + } + let mut file_format = ParquetFormat::new().with_options(options); if let Some(parquet_pruning) = self.parquet_pruning { diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 088c4408fff5..52c5393e1031 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -154,7 +154,6 @@ mod tests { use futures::stream::BoxStream; use futures::StreamExt; use insta::assert_snapshot; - use log::error; use object_store::local::LocalFileSystem; use object_store::ObjectMeta; use object_store::{ @@ -163,9 +162,10 @@ mod tests { }; use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::ParquetRecordBatchStreamBuilder; - use parquet::file::metadata::{KeyValue, ParquetColumnIndex, ParquetOffsetIndex}; - use parquet::file::page_index::index::Index; - use parquet::format::FileMetaData; + use parquet::file::metadata::{ + KeyValue, ParquetColumnIndex, ParquetMetaData, ParquetOffsetIndex, + }; + use parquet::file::page_index::column_index::ColumnIndexMetaData; use tokio::fs::File; enum ForceViews { @@ -546,7 +546,8 @@ mod tests { let (files, _file_names) = store_parquet(vec![batch1], false).await?; let state = SessionContext::new().state(); - let format = ParquetFormat::default(); + // Make metadata size hint None to keep original behavior + let format = ParquetFormat::default().with_metadata_size_hint(None); let _schema = format.infer_schema(&state, &store.upcast(), &files).await?; assert_eq!(store.request_count(), 3); // No increase, cache being used. 
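
> Editor's aside (not part of the patch): the hunks above add a per-read `metadata_size_hint` field and builder method to `ParquetReadOptions` and plumb it through to `options.global.metadata_size_hint`, while the parquet test hunks pin `ParquetFormat::with_metadata_size_hint(None)` to keep the old behavior. A minimal sketch of how a caller might use the new builder is shown below; the file path and the 512 KiB value are illustrative placeholders, not taken from the patch.

```rust
// Illustrative sketch only -- not part of the diff above. It exercises the
// new ParquetReadOptions::metadata_size_hint builder added in this change.
use datafusion::error::Result;
use datafusion::prelude::{ParquetReadOptions, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // Ask the reader to fetch roughly 512 KiB from the tail of each file when
    // loading the Parquet footer, overriding the session-wide default for this
    // read only. (Path and hint size are placeholder values for illustration.)
    let options = ParquetReadOptions::default().metadata_size_hint(Some(512 * 1024));

    let df = ctx.read_parquet("data/example.parquet", options).await?;
    df.show().await?;
    Ok(())
}
```
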
@@ -620,7 +621,9 @@ mod tests { let mut state = SessionContext::new().state(); state = set_view_state(state, force_views); - let format = ParquetFormat::default().with_force_view_types(force_views); + let format = ParquetFormat::default() + .with_force_view_types(force_views) + .with_metadata_size_hint(None); let schema = format.infer_schema(&state, &store.upcast(), &files).await?; assert_eq!(store.request_count(), 6); @@ -1144,18 +1147,14 @@ mod tests { // 325 pages in int_col assert_eq!(int_col_offset.len(), 325); - match int_col_index { - Index::INT32(index) => { - assert_eq!(index.indexes.len(), 325); - for min_max in index.clone().indexes { - assert!(min_max.min.is_some()); - assert!(min_max.max.is_some()); - assert!(min_max.null_count.is_some()); - } - } - _ => { - error!("fail to read page index.") - } + let ColumnIndexMetaData::INT32(index) = int_col_index else { + panic!("fail to read page index.") + }; + assert_eq!(index.min_values().len(), 325); + assert_eq!(index.max_values().len(), 325); + // all values are non null + for idx in 0..325 { + assert_eq!(index.null_count(idx), Some(0)); } } @@ -1556,7 +1555,7 @@ mod tests { Ok(parquet_sink) } - fn get_written(parquet_sink: Arc) -> Result<(Path, FileMetaData)> { + fn get_written(parquet_sink: Arc) -> Result<(Path, ParquetMetaData)> { let mut written = parquet_sink.written(); let written = written.drain(); assert_eq!( @@ -1566,28 +1565,33 @@ mod tests { written.len() ); - let (path, file_metadata) = written.take(1).next().unwrap(); - Ok((path, file_metadata)) + let (path, parquet_meta_data) = written.take(1).next().unwrap(); + Ok((path, parquet_meta_data)) } - fn assert_file_metadata(file_metadata: FileMetaData, expected_kv: &Vec) { - let FileMetaData { - num_rows, - schema, - key_value_metadata, - .. - } = file_metadata; - assert_eq!(num_rows, 2, "file metadata to have 2 rows"); + fn assert_file_metadata( + parquet_meta_data: ParquetMetaData, + expected_kv: &Vec, + ) { + let file_metadata = parquet_meta_data.file_metadata(); + let schema_descr = file_metadata.schema_descr(); + assert_eq!(file_metadata.num_rows(), 2, "file metadata to have 2 rows"); assert!( - schema.iter().any(|col_schema| col_schema.name == "a"), + schema_descr + .columns() + .iter() + .any(|col_schema| col_schema.name() == "a"), "output file metadata should contain col a" ); assert!( - schema.iter().any(|col_schema| col_schema.name == "b"), + schema_descr + .columns() + .iter() + .any(|col_schema| col_schema.name() == "b"), "output file metadata should contain col b" ); - let mut key_value_metadata = key_value_metadata.unwrap(); + let mut key_value_metadata = file_metadata.key_value_metadata().unwrap().clone(); key_value_metadata.sort_by(|a, b| a.key.cmp(&b.key)); assert_eq!(&key_value_metadata, expected_kv); } @@ -1644,13 +1648,11 @@ mod tests { // check the file metadata includes partitions let mut expected_partitions = std::collections::HashSet::from(["a=foo", "a=bar"]); - for ( - path, - FileMetaData { - num_rows, schema, .. 
- }, - ) in written.take(2) - { + for (path, parquet_metadata) in written.take(2) { + let file_metadata = parquet_metadata.file_metadata(); + let schema = file_metadata.schema_descr(); + let num_rows = file_metadata.num_rows(); + let path_parts = path.parts().collect::>(); assert_eq!(path_parts.len(), 2, "should have path prefix"); @@ -1663,11 +1665,17 @@ mod tests { assert_eq!(num_rows, 1, "file metadata to have 1 row"); assert!( - !schema.iter().any(|col_schema| col_schema.name == "a"), + !schema + .columns() + .iter() + .any(|col_schema| col_schema.name() == "a"), "output file metadata will not contain partitioned col a" ); assert!( - schema.iter().any(|col_schema| col_schema.name == "b"), + schema + .columns() + .iter() + .any(|col_schema| col_schema.name() == "b"), "output file metadata should contain col b" ); } diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 94d651ddadd5..37b9663111a5 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -45,6 +45,7 @@ pub use datafusion_catalog::view; pub use datafusion_datasource::schema_adapter; pub use datafusion_datasource::sink; pub use datafusion_datasource::source; +pub use datafusion_datasource::table_schema; pub use datafusion_execution::object_store; pub use datafusion_physical_expr::create_ordering; diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 8a00af959ccc..9068c9758179 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -88,7 +88,7 @@ mod tests { source, ) .with_file(meta.into()) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -160,7 +160,7 @@ mod tests { let source = Arc::new(AvroSource::new()); let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file(meta.into()) - .with_projection(projection) + .with_projection_indices(projection) .build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -231,7 +231,7 @@ mod tests { let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) // select specific columns of the files as well as the partitioning // column which is supposed to be the last column in the table schema. 
- .with_projection(projection) + .with_projection_indices(projection) .with_file(partitioned_file) .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) .build(); diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index b2ef51a76f89..4f46a57d8b13 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -118,7 +118,7 @@ mod tests { )) .with_file_compression_type(file_compression_type) .with_newlines_in_values(false) - .with_projection(Some(vec![0, 2, 4])) + .with_projection_indices(Some(vec![0, 2, 4])) .build(); assert_eq!(13, config.file_schema().fields().len()); @@ -183,7 +183,7 @@ mod tests { )) .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()) - .with_projection(Some(vec![4, 0, 2])) + .with_projection_indices(Some(vec![4, 0, 2])) .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); @@ -373,7 +373,7 @@ mod tests { .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) // We should be able to project on the partition column // Which is supposed to be after the file fields - .with_projection(Some(vec![0, num_file_schema_fields])) + .with_projection_indices(Some(vec![0, num_file_schema_fields])) .build(); // we don't have `/date=xx/` in the path but that is ok because diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 0d45711c76fb..f7d5c710bf48 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -297,7 +297,7 @@ mod tests { let source = Arc::new(JsonSource::new()); let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file_groups(file_groups) - .with_projection(Some(vec![0, 2])) + .with_projection_indices(Some(vec![0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); @@ -345,7 +345,7 @@ mod tests { let source = Arc::new(JsonSource::new()); let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file_groups(file_groups) - .with_projection(Some(vec![3, 0, 2])) + .with_projection_indices(Some(vec![3, 0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 10a475c1cc9a..18b855cec55e 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -54,7 +54,7 @@ mod tests { use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::file::FileSource; - use datafusion_datasource::{FileRange, PartitionedFile}; + use datafusion_datasource::{FileRange, PartitionedFile, TableSchema}; use datafusion_datasource_parquet::source::ParquetSource; use datafusion_datasource_parquet::{ DefaultParquetFileReaderFactory, ParquetFileReaderFactory, ParquetFormat, @@ -186,7 +186,7 @@ mod tests { source = source.with_bloom_filter_on_read(false); } - source.with_schema(Arc::clone(&table_schema)) + source.with_schema(TableSchema::new(Arc::clone(&table_schema), vec![])) } fn build_parquet_exec( @@ -201,7 +201,7 @@ mod tests { source, ) .with_file_group(file_group) - 
.with_projection(self.projection.clone()) + .with_projection_indices(self.projection.clone()) .build(); DataSourceExec::from_data_source(base_config) } @@ -1655,7 +1655,7 @@ mod tests { let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) .with_file(partitioned_file) // file has 10 cols so index 12 should be month and 13 should be day - .with_projection(Some(vec![0, 1, 2, 12, 13])) + .with_projection_indices(Some(vec![0, 1, 2, 12, 13])) .with_table_partition_cols(vec![ Field::new("year", DataType::Utf8, false), Field::new("month", DataType::UInt8, false), diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs index 15d6d21f038a..e6f95886e91d 100644 --- a/datafusion/core/src/execution/context/csv.rs +++ b/datafusion/core/src/execution/context/csv.rs @@ -37,9 +37,16 @@ impl SessionContext { /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// // You can read a single file using `read_csv` - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // you can also read multiple files: - /// let df = ctx.read_csv(vec!["tests/data/example.csv", "tests/data/example.csv"], CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv( + /// vec!["tests/data/example.csv", "tests/data/example.csv"], + /// CsvReadOptions::new(), + /// ) + /// .await?; /// # Ok(()) /// # } /// ``` diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 87dc18be5b83..687779787ab5 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -64,12 +64,13 @@ use datafusion_catalog::{ DynamicFileCatalog, TableFunction, TableFunctionImpl, UrlTableFactory, }; use datafusion_common::config::ConfigOptions; +use datafusion_common::metadata::ScalarAndMetadata; use datafusion_common::{ config::{ConfigExtension, TableOptions}, exec_datafusion_err, exec_err, internal_datafusion_err, not_impl_err, plan_datafusion_err, plan_err, tree_node::{TreeNodeRecursion, TreeNodeVisitor}, - DFSchema, DataFusionError, ParamValues, ScalarValue, SchemaReference, TableReference, + DFSchema, DataFusionError, ParamValues, SchemaReference, TableReference, }; pub use datafusion_execution::config::SessionConfig; use datafusion_execution::registry::SerializerRegistry; @@ -165,22 +166,23 @@ where /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); -/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; -/// let df = df.filter(col("a").lt_eq(col("b")))? -/// .aggregate(vec![col("a")], vec![min(col("b"))])? -/// .limit(0, Some(100))?; -/// let results = df -/// .collect() -/// .await?; +/// let df = ctx +/// .read_csv("tests/data/example.csv", CsvReadOptions::new()) +/// .await?; +/// let df = df +/// .filter(col("a").lt_eq(col("b")))? +/// .aggregate(vec![col("a")], vec![min(col("b"))])? 
+/// .limit(0, Some(100))?; +/// let results = df.collect().await?; /// assert_batches_eq!( -/// &[ -/// "+---+----------------+", -/// "| a | min(?table?.b) |", -/// "+---+----------------+", -/// "| 1 | 2 |", -/// "+---+----------------+", -/// ], -/// &results +/// &[ +/// "+---+----------------+", +/// "| a | min(?table?.b) |", +/// "+---+----------------+", +/// "| 1 | 2 |", +/// "+---+----------------+", +/// ], +/// &results /// ); /// # Ok(()) /// # } @@ -196,21 +198,22 @@ where /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); -/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; +/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()) +/// .await?; /// let results = ctx -/// .sql("SELECT a, min(b) FROM example GROUP BY a LIMIT 100") -/// .await? -/// .collect() -/// .await?; +/// .sql("SELECT a, min(b) FROM example GROUP BY a LIMIT 100") +/// .await? +/// .collect() +/// .await?; /// assert_batches_eq!( -/// &[ -/// "+---+----------------+", -/// "| a | min(example.b) |", -/// "+---+----------------+", -/// "| 1 | 2 |", -/// "+---+----------------+", -/// ], -/// &results +/// &[ +/// "+---+----------------+", +/// "| a | min(example.b) |", +/// "+---+----------------+", +/// "| 1 | 2 |", +/// "+---+----------------+", +/// ], +/// &results /// ); /// # Ok(()) /// # } @@ -230,18 +233,18 @@ where /// let config = SessionConfig::new().with_batch_size(4 * 1024); /// /// // configure a memory limit of 1GB with 20% slop -/// let runtime_env = RuntimeEnvBuilder::new() +/// let runtime_env = RuntimeEnvBuilder::new() /// .with_memory_limit(1024 * 1024 * 1024, 0.80) /// .build_arc() /// .unwrap(); /// /// // Create a SessionState using the config and runtime_env /// let state = SessionStateBuilder::new() -/// .with_config(config) -/// .with_runtime_env(runtime_env) -/// // include support for built in functions and configurations -/// .with_default_features() -/// .build(); +/// .with_config(config) +/// .with_runtime_env(runtime_env) +/// // include support for built in functions and configurations +/// .with_default_features() +/// .build(); /// /// // Create a SessionContext /// let ctx = SessionContext::from(state); @@ -427,16 +430,14 @@ impl SessionContext { /// # use datafusion::prelude::*; /// # use datafusion::execution::SessionStateBuilder; /// # use datafusion_optimizer::push_down_filter::PushDownFilter; - /// let my_rule = PushDownFilter{}; // pretend it is a new rule - /// // Create a new builder with a custom optimizer rule + /// let my_rule = PushDownFilter {}; // pretend it is a new rule + /// // Create a new builder with a custom optimizer rule /// let context: SessionContext = SessionStateBuilder::new() - /// .with_optimizer_rule(Arc::new(my_rule)) - /// .build() - /// .into(); + /// .with_optimizer_rule(Arc::new(my_rule)) + /// .build() + /// .into(); /// // Enable local file access and convert context back to a builder - /// let builder = context - /// .enable_url_table() - /// .into_state_builder(); + /// let builder = context.enable_url_table().into_state_builder(); /// ``` pub fn into_state_builder(self) -> SessionStateBuilder { let SessionContext { @@ -584,11 +585,10 @@ impl SessionContext { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// ctx - /// .sql("CREATE TABLE foo (x INTEGER)") - /// .await? - /// .collect() - /// .await?; + /// ctx.sql("CREATE TABLE foo (x INTEGER)") + /// .await? 
+ /// .collect() + /// .await?; /// assert!(ctx.table_exist("foo").unwrap()); /// # Ok(()) /// # } @@ -613,14 +613,14 @@ impl SessionContext { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let options = SQLOptions::new() - /// .with_allow_ddl(false); - /// let err = ctx.sql_with_options("CREATE TABLE foo (x INTEGER)", options) - /// .await - /// .unwrap_err(); - /// assert!( - /// err.to_string().starts_with("Error during planning: DDL not supported: CreateMemoryTable") - /// ); + /// let options = SQLOptions::new().with_allow_ddl(false); + /// let err = ctx + /// .sql_with_options("CREATE TABLE foo (x INTEGER)", options) + /// .await + /// .unwrap_err(); + /// assert!(err + /// .to_string() + /// .starts_with("Error during planning: DDL not supported: CreateMemoryTable")); /// # Ok(()) /// # } /// ``` @@ -652,8 +652,7 @@ impl SessionContext { /// // provide type information that `a` is an Int32 /// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); /// let df_schema = DFSchema::try_from(schema).unwrap(); - /// let expr = SessionContext::new() - /// .parse_sql_expr(sql, &df_schema)?; + /// let expr = SessionContext::new().parse_sql_expr(sql, &df_schema)?; /// assert_eq!(expected, expr); /// # Ok(()) /// # } @@ -715,15 +714,15 @@ impl SessionContext { LogicalPlan::Statement(Statement::Prepare(Prepare { name, input, - data_types, + fields, })) => { // The number of parameters must match the specified data types length. - if !data_types.is_empty() { + if !fields.is_empty() { let param_names = input.get_parameter_names()?; - if param_names.len() != data_types.len() { + if param_names.len() != fields.len() { return plan_err!( "Prepare specifies {} data types but query has {} parameters", - data_types.len(), + fields.len(), param_names.len() ); } @@ -733,7 +732,7 @@ impl SessionContext { // not currently feasible. This is because `now()` would be optimized to a // constant value, causing each EXECUTE to yield the same result, which is // incorrect behavior. - self.state.write().store_prepared(name, data_types, input)?; + self.state.write().store_prepared(name, fields, input)?; self.return_empty_dataframe() } LogicalPlan::Statement(Statement::Execute(execute)) => { @@ -1142,8 +1141,14 @@ impl SessionContext { /// ``` /// use datafusion::execution::context::SessionContext; /// - /// assert_eq!(SessionContext::parse_memory_limit("1M").unwrap(), 1024 * 1024); - /// assert_eq!(SessionContext::parse_memory_limit("1.5G").unwrap(), (1.5 * 1024.0 * 1024.0 * 1024.0) as usize); + /// assert_eq!( + /// SessionContext::parse_memory_limit("1M").unwrap(), + /// 1024 * 1024 + /// ); + /// assert_eq!( + /// SessionContext::parse_memory_limit("1.5G").unwrap(), + /// (1.5 * 1024.0 * 1024.0 * 1024.0) as usize + /// ); /// ``` pub fn parse_memory_limit(limit: &str) -> Result { let (number, unit) = limit.split_at(limit.len() - 1); @@ -1265,28 +1270,30 @@ impl SessionContext { })?; // Only allow literals as parameters for now. - let mut params: Vec = parameters + let mut params: Vec = parameters .into_iter() .map(|e| match e { - Expr::Literal(scalar, _) => Ok(scalar), + Expr::Literal(scalar, metadata) => { + Ok(ScalarAndMetadata::new(scalar, metadata)) + } _ => not_impl_err!("Unsupported parameter type: {}", e), }) .collect::>()?; // If the prepared statement provides data types, cast the params to those types. 
- if !prepared.data_types.is_empty() { - if params.len() != prepared.data_types.len() { + if !prepared.fields.is_empty() { + if params.len() != prepared.fields.len() { return exec_err!( "Prepared statement '{}' expects {} parameters, but {} provided", name, - prepared.data_types.len(), + prepared.fields.len(), params.len() ); } params = params .into_iter() - .zip(prepared.data_types.iter()) - .map(|(e, dt)| e.cast_to(dt)) + .zip(prepared.fields.iter()) + .map(|(e, dt)| -> Result<_> { e.cast_storage_to(dt.data_type()) }) .collect::>()?; } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index b3b336f5605c..2949b17537d9 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -30,7 +30,7 @@ use crate::datasource::provider_as_source; use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner}; use crate::execution::SessionStateDefaults; use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}; -use arrow::datatypes::DataType; +use arrow_schema::{DataType, FieldRef}; use datafusion_catalog::information_schema::{ InformationSchemaProvider, INFORMATION_SCHEMA, }; @@ -114,12 +114,12 @@ use uuid::Uuid; /// # use std::sync::Arc; /// # #[tokio::main] /// # async fn main() -> Result<()> { -/// let state = SessionStateBuilder::new() -/// .with_config(SessionConfig::new()) -/// .with_runtime_env(Arc::new(RuntimeEnv::default())) -/// .with_default_features() -/// .build(); -/// Ok(()) +/// let state = SessionStateBuilder::new() +/// .with_config(SessionConfig::new()) +/// .with_runtime_env(Arc::new(RuntimeEnv::default())) +/// .with_default_features() +/// .build(); +/// Ok(()) /// # } /// ``` /// @@ -872,12 +872,12 @@ impl SessionState { pub(crate) fn store_prepared( &mut self, name: String, - data_types: Vec, + fields: Vec, plan: Arc, ) -> datafusion_common::Result<()> { match self.prepared_plans.entry(name) { Entry::Vacant(e) => { - e.insert(Arc::new(PreparedPlan { data_types, plan })); + e.insert(Arc::new(PreparedPlan { fields, plan })); Ok(()) } Entry::Occupied(e) => { @@ -1322,7 +1322,7 @@ impl SessionStateBuilder { /// let url = Url::try_from("file://").unwrap(); /// let object_store = object_store::local::LocalFileSystem::new(); /// let state = SessionStateBuilder::new() - /// .with_config(SessionConfig::new()) + /// .with_config(SessionConfig::new()) /// .with_object_store(&url, Arc::new(object_store)) /// .with_default_features() /// .build(); @@ -2030,7 +2030,7 @@ impl SimplifyInfo for SessionSimplifyProvider<'_> { #[derive(Debug)] pub(crate) struct PreparedPlan { /// Data types of the parameters - pub(crate) data_types: Vec, + pub(crate) fields: Vec, /// The prepared logical plan pub(crate) plan: Arc, } diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index e7ace544a11c..381dd5e9e848 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -86,26 +86,29 @@ //! let ctx = SessionContext::new(); //! //! // create the dataframe -//! let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; +//! let df = ctx +//! .read_csv("tests/data/example.csv", CsvReadOptions::new()) +//! .await?; //! //! // create a plan -//! let df = df.filter(col("a").lt_eq(col("b")))? -//! .aggregate(vec![col("a")], vec![min(col("b"))])? -//! .limit(0, Some(100))?; +//! let df = df +//! .filter(col("a").lt_eq(col("b")))? +//! .aggregate(vec![col("a")], vec![min(col("b"))])? +//! .limit(0, Some(100))?; //! 
//! // execute the plan //! let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)? -//! .to_string(); +//! let pretty_results = +//! arrow::util::pretty::pretty_format_batches(&results)?.to_string(); //! //! let expected = vec![ //! "+---+----------------+", //! "| a | min(?table?.b) |", //! "+---+----------------+", //! "| 1 | 2 |", -//! "+---+----------------+" +//! "+---+----------------+", //! ]; //! //! assert_eq!(pretty_results.trim().lines().collect::>(), expected); @@ -126,24 +129,27 @@ //! # async fn main() -> Result<()> { //! let ctx = SessionContext::new(); //! -//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; +//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()) +//! .await?; //! //! // create a plan -//! let df = ctx.sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100").await?; +//! let df = ctx +//! .sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100") +//! .await?; //! //! // execute the plan //! let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)? -//! .to_string(); +//! let pretty_results = +//! arrow::util::pretty::pretty_format_batches(&results)?.to_string(); //! //! let expected = vec![ //! "+---+----------------+", //! "| a | min(example.b) |", //! "+---+----------------+", //! "| 1 | 2 |", -//! "+---+----------------+" +//! "+---+----------------+", //! ]; //! //! assert_eq!(pretty_results.trim().lines().collect::>(), expected); @@ -443,7 +449,30 @@ //! other operators read a single [`RecordBatch`] from their input to produce a //! single [`RecordBatch`] as output. //! -//! For example, given this SQL query: +//! For example, given this SQL: +//! +//! ```sql +//! SELECT name FROM 'data.parquet' WHERE id > 10 +//! ``` +//! +//! An simplified DataFusion execution plan is shown below. It first reads +//! data from the Parquet file, then applies the filter, then the projection, +//! and finally produces output. Each step processes one [`RecordBatch`] at a +//! time. Multiple batches are processed concurrently on different CPU cores +//! for plans with multiple partitions. +//! +//! ```text +//! ┌─────────────┐ ┌──────────────┐ ┌────────────────┐ ┌──────────────────┐ ┌──────────┐ +//! │ Parquet │───▶│ DataSource │───▶│ FilterExec │───▶│ ProjectionExec │───▶│ Results │ +//! │ File │ │ │ │ │ │ │ │ │ +//! └─────────────┘ └──────────────┘ └────────────────┘ └──────────────────┘ └──────────┘ +//! (reads data) (id > 10) (keeps "name" col) +//! RecordBatch ───▶ RecordBatch ────▶ RecordBatch ────▶ RecordBatch +//! ``` +//! +//! DataFusion uses the classic "pull" based control flow (explained more in the +//! next section) to implement streaming execution. As an example, +//! consider the following SQL query: //! //! ```sql //! SELECT date_trunc('month', time) FROM data WHERE id IN (10,20,30); @@ -607,7 +636,7 @@ //! └─────────────┘ ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛ //! ─────────────────────────────────────────────────────────────▶ //! time -//!``` +//! ``` //! //! Note that DataFusion does not use [`tokio::task::spawn_blocking`] for //! 
CPU-bounded work, because `spawn_blocking` is designed for blocking **IO**, @@ -897,6 +926,12 @@ doc_comment::doctest!("../../../README.md", readme_example_test); // For example, if `user_guide_expressions(line 123)` fails, // go to `docs/source/user-guide/expressions.md` to find the relevant problem. // +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/user-guide/arrow-introduction.md", + user_guide_arrow_introduction +); + #[cfg(doctest)] doc_comment::doctest!( "../../../docs/source/user-guide/concepts-readings-events.md", diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 708c52001ee8..c280b50a9f07 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2644,7 +2644,7 @@ mod tests { // verify that the plan correctly casts u8 to i64 // the cast from u8 to i64 for literal will be simplified, and get lit(int64(5)) // the cast here is implicit so has CastOptions with safe=true - let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }"#; + let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64 } }, fail_on_overflow: false"#; assert_contains!(format!("{exec_plan:?}"), expected); Ok(()) @@ -2704,9 +2704,6 @@ mod tests { name: "lit", data_type: Utf8, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c1", @@ -2718,9 +2715,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c2", @@ -2732,9 +2726,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c3", @@ -2843,9 +2834,6 @@ mod tests { name: "lit", data_type: Utf8, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c1", @@ -2857,9 +2845,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c2", @@ -2871,9 +2856,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c3", @@ -3047,7 +3029,7 @@ mod tests { .expect_err("planning error") .strip_backtrace(); - insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }"#); + insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. 
LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32 }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32 }], metadata: {} }"#); } #[tokio::test] @@ -3063,7 +3045,7 @@ mod tests { let execution_plan = plan(&logical_plan).await?; // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. - let expected = "exprs: [ProjectionExpr { expr: BinaryExpr { left: BinaryExpr { left: Column { name: \"c1\", index: 0 }, op: Eq, right: Literal { value: Utf8(\"a\"), field: Field { name: \"lit\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }, op: Or, right: BinaryExpr { left: Column { name: \"c1\", index: 0 }, op: Eq, right: Literal { value: Utf8(\"1\"), field: Field { name: \"lit\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }, fail_on_overflow: false }"; + let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#; assert_contains!(format!("{execution_plan:?}"), expected); @@ -3085,7 +3067,7 @@ mod tests { assert_contains!( &e, - r#"Error during planning: Can not find compatible types to compare Boolean with [Struct(foo Boolean), Utf8]"# + r#"Error during planning: Can not find compatible types to compare Boolean with [Struct("foo": Boolean), Utf8]"# ); Ok(()) diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index eb4c61c02524..203d9e97d2a8 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -40,6 +40,7 @@ use crate::prelude::{Expr, SessionConfig, SessionContext}; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource::TableSchema; use object_store::path::Path; use object_store::ObjectMeta; use parquet::arrow::ArrowWriter; @@ -186,7 +187,7 @@ impl TestParquetFile { ParquetSource::new(parquet_options) .with_predicate(Arc::clone(&physical_filter_expr)), ) - .with_schema(Arc::clone(&self.schema)); + .with_schema(TableSchema::from_file_schema(Arc::clone(&self.schema))); let config = scan_config_builder.with_source(source).build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index d95eb38c19e1..265862ff9af8 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -309,16 +309,16 @@ async fn test_fn_arrow_typeof() -> Result<()> { assert_snapshot!( batches_to_string(&batches), - @r#" - +------------------------------------------------------------------------------------------------------------------+ - | arrow_typeof(test.l) | - +------------------------------------------------------------------------------------------------------------------+ - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - +------------------------------------------------------------------------------------------------------------------+ - "#); + @r" + +----------------------+ + | arrow_typeof(test.l) | + +----------------------+ + | List(nullable Int32) | + | List(nullable Int32) | + | List(nullable Int32) | + | List(nullable Int32) | + +----------------------+ + "); Ok(()) } diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index aa538f6dee81..043f42b18c9f 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -33,6 +33,7 @@ use arrow::error::ArrowError; use arrow::util::pretty::pretty_format_batches; use arrow_schema::{SortOptions, TimeUnit}; use datafusion::{assert_batches_eq, dataframe}; +use datafusion_common::metadata::FieldMetadata; use datafusion_functions_aggregate::count::{count_all, count_all_window}; use datafusion_functions_aggregate::expr_fn::{ array_agg, avg, avg_distinct, count, count_distinct, max, median, min, sum, @@ -65,15 +66,13 @@ use datafusion_catalog::TableProvider; use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; use datafusion_common::{ assert_contains, internal_datafusion_err, Constraint, Constraints, DFSchema, - DataFusionError, ParamValues, ScalarValue, TableReference, UnnestOptions, + DataFusionError, ScalarValue, TableReference, UnnestOptions, }; use datafusion_common_runtime::SpawnedTask; use datafusion_datasource::file_format::format_as_file_type; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeEnv; -use datafusion_expr::expr::{ - FieldMetadata, GroupingSet, NullTreatment, Sort, WindowFunction, -}; +use datafusion_expr::expr::{GroupingSet, NullTreatment, Sort, WindowFunction}; use datafusion_expr::var_provider::{VarProvider, VarType}; use datafusion_expr::{ cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder, @@ -2465,7 +2464,7 @@ async fn filtered_aggr_with_param_values() -> Result<()> { let df = ctx .sql("select count (c2) filter (where c3 > $1) from table1") .await? 
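The hunk that continues just below replaces the explicit `ParamValues::List` wrapper with a plain `Vec<ScalarValue>`. A minimal sketch of that shorthand, assuming `with_param_values` accepts anything convertible into `ParamValues` and that a table named `table1` with columns `c2` and `c3` is already registered (both assumptions mirror the test, not new API claims):

```rust
use datafusion::common::ScalarValue;
use datafusion::prelude::*;

async fn filtered_count(ctx: &SessionContext) -> datafusion::error::Result<()> {
    let batches = ctx
        .sql("select count(c2) filter (where c3 > $1) from table1")
        .await?
        // shorthand: a plain Vec<ScalarValue> converts into ParamValues::List
        .with_param_values(vec![ScalarValue::from(10u64)])?
        .collect()
        .await?;
    datafusion::arrow::util::pretty::print_batches(&batches)?;
    Ok(())
}
```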
- .with_param_values(ParamValues::List(vec![ScalarValue::from(10u64)])); + .with_param_values(vec![ScalarValue::from(10u64)]); let df_results = df?.collect().await?; assert_snapshot!( @@ -2945,18 +2944,18 @@ async fn test_count_wildcard_on_window() -> Result<()> { assert_snapshot!( pretty_format_batches(&sql_results).unwrap(), @r#" - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | - | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | - | | TableScan: t1 projection=[a] | - | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | - | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | - | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | 
+ | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ "# ); @@ -2979,18 +2978,18 @@ async fn test_count_wildcard_on_window() -> Result<()> { assert_snapshot!( pretty_format_batches(&df_results).unwrap(), @r#" - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | - | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | - | | TableScan: t1 projection=[a] | - | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | - | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | - | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - 
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | + | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ "# ); @@ -4436,12 +4435,12 @@ async fn unnest_with_redundant_columns() -> Result<()> { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" Projection: shapes.shape_id [shape_id:UInt32] Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N] - Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { data_type: UInt32, nullable: true });N] TableScan: shapes projection=[shape_id] [shape_id:UInt32] - "### + " ); let results = df.collect().await?; @@ -6460,10 +6459,10 @@ async fn test_duplicate_state_fields_for_dfschema_construct() -> Result<()> { "ticker", "first_value(value)[first_value]", "timestamp@0", - "is_set", + "first_value(value)[first_value_is_set]", "last_value(value)[last_value]", "timestamp@0", - "is_set", + "last_value(value)[last_value_is_set]", ]; let binding = partial_agg.schema(); diff --git a/datafusion/core/tests/datasource/mod.rs 
b/datafusion/core/tests/datasource/mod.rs index d1f3b3957c0f..3785aa076618 100644 --- a/datafusion/core/tests/datasource/mod.rs +++ b/datafusion/core/tests/datasource/mod.rs @@ -21,3 +21,4 @@ // Include tests in csv module mod csv; +mod object_store_access; diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs new file mode 100644 index 000000000000..d1592c21472d --- /dev/null +++ b/datafusion/core/tests/datasource/object_store_access.rs @@ -0,0 +1,751 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests for object store access patterns with [`ListingTable`]\ +//! +//! These tests setup a `ListingTable` backed by an in-memory object store +//! that counts the number of requests made against it and then do +//! various operations (table creation, queries with and without predicates) +//! to verify the expected object store access patterns. +//! +//! [`ListingTable`]: datafusion::datasource::listing::ListingTable + +use arrow::array::{ArrayRef, Int32Array, RecordBatch}; +use async_trait::async_trait; +use bytes::Bytes; +use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext}; +use futures::stream::BoxStream; +use insta::assert_snapshot; +use object_store::memory::InMemory; +use object_store::path::Path; +use object_store::{ + GetOptions, GetRange, GetResult, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, +}; +use parking_lot::Mutex; +use std::fmt; +use std::fmt::{Display, Formatter}; +use std::ops::Range; +use std::sync::Arc; +use url::Url; + +#[tokio::test] +async fn create_single_csv_file() { + let test = Test::new().with_single_file_csv().await; + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=csv_table.csv + - GET path=csv_table.csv + " + ); +} + +#[tokio::test] +async fn query_single_csv_file() { + let test = Test::new().with_single_file_csv().await; + assert_snapshot!( + test.query("select * from csv_table").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.00001 | 5e-12 | true | + | 0.00002 | 4e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=csv_table.csv + - GET (opts) path=csv_table.csv + " + ); +} + +#[tokio::test] +async fn create_multi_file_csv_file() { + let test = Test::new().with_multi_file_csv().await; + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 4 + - LIST prefix=data + - GET path=data/file_0.csv + - GET path=data/file_1.csv + - GET path=data/file_2.csv + " + 
); +} + +#[tokio::test] +async fn query_multi_csv_file() { + let test = Test::new().with_multi_file_csv().await; + assert_snapshot!( + test.query("select * from csv_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 4 + - LIST prefix=data + - GET (opts) path=data/file_0.csv + - GET (opts) path=data/file_1.csv + - GET (opts) path=data/file_2.csv + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_default() { + // The default metadata size hint is 512KB + // which is enough to fetch the entire footer metadata and PageIndex + // in a single GET request. + let test = Test::new().with_single_file_parquet().await; + // expect 1 get request which reads the footer metadata and page index + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=0-2994 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_prefetch() { + // Explicitly specify a prefetch hint that is adequate for the footer and page index + let test = Test::new() + .with_parquet_metadata_size_hint(Some(1000)) + .with_single_file_parquet() + .await; + // expect 1 1000 byte request which reads the footer metadata and page index + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=1994-2994 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_too_small_prefetch() { + // configure a prefetch size that is too small to fetch the footer + // metadata + // + // Using the ranges from the test below (with no_prefetch), + // pick a number less than 730: + // -------- + // 2286-2294: (8 bytes) footer + length + // 2264-2986: (722 bytes) footer metadata + let test = Test::new() + .with_parquet_metadata_size_hint(Some(500)) + .with_single_file_parquet() + .await; + // expect three get requests: + // 1. read the footer (500 bytes per hint, not enough for the footer metadata) + // 2. Read the footer metadata + // 3. reads the PageIndex + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 4 + - HEAD path=parquet_table.parquet + - GET (range) range=2494-2994 path=parquet_table.parquet + - GET (range) range=2264-2986 path=parquet_table.parquet + - GET (range) range=2124-2264 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_small_prefetch() { + // configure a prefetch size that is large enough for the footer + // metadata but **not** the PageIndex + // + // Using the ranges from the test below (with no_prefetch), + // the 730 is determined as follows; + // -------- + // 2286-2294: (8 bytes) footer + length + // 2264-2986: (722 bytes) footer metadata + let test = Test::new() + // 740 is enough to get both the footer + length (8 bytes) + // but not the entire PageIndex + .with_parquet_metadata_size_hint(Some(740)) + .with_single_file_parquet() + .await; + // expect two get requests: + // 1. read the footer metadata + // 2. 
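These prefetch tests all turn the same knob: the Parquet metadata size hint that `Test::register_parquet` (further below) forwards to `ParquetReadOptions`. A minimal sketch of setting that hint directly, assuming `metadata_size_hint` accepts an `Option<usize>` exactly as the helper passes it (the table name and `mem://` path are illustrative):

```rust
use datafusion::prelude::{ParquetReadOptions, SessionContext};

async fn register_with_hint(
    ctx: &SessionContext,
    // e.g. Some(1000) prefetches the last 1000 bytes (footer + page index)
    // in one GET; None leaves the reader to issue separate footer,
    // metadata, and page-index reads
    size_hint: Option<usize>,
) -> datafusion::error::Result<()> {
    let options = ParquetReadOptions::new().metadata_size_hint(size_hint);
    ctx.register_parquet("parquet_table", "mem:///parquet_table.parquet", options)
        .await
}
```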
reads the PageIndex + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 3 + - HEAD path=parquet_table.parquet + - GET (range) range=2254-2994 path=parquet_table.parquet + - GET (range) range=2124-2264 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_no_prefetch() { + let test = Test::new() + // force no prefetch by setting size hint to None + .with_parquet_metadata_size_hint(None) + .with_single_file_parquet() + .await; + // Without a metadata size hint, the parquet reader + // does *three* range requests to read the footer metadata: + // 1. The footer length (last 8 bytes) + // 2. The footer metadata + // 3. The PageIndex metadata + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=0-2994 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn query_single_parquet_file() { + let test = Test::new().with_single_file_parquet().await; + assert_snapshot!( + test.query("select count(distinct a), count(b) from parquet_table").await, + @r" + ------- Query Output (1 rows) ------- + +---------------------------------+------------------------+ + | count(DISTINCT parquet_table.a) | count(parquet_table.b) | + +---------------------------------+------------------------+ + | 200 | 200 | + +---------------------------------+------------------------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - HEAD path=parquet_table.parquet + - GET (ranges) path=parquet_table.parquet ranges=4-534,534-1064 + - GET (ranges) path=parquet_table.parquet ranges=1064-1594,1594-2124 + " + ); +} + +#[tokio::test] +async fn query_single_parquet_file_with_single_predicate() { + let test = Test::new().with_single_file_parquet().await; + // Note that evaluating predicates requires additional object store requests + // (to evaluate predicates) + assert_snapshot!( + test.query("select min(a), max(b) from parquet_table WHERE a > 150").await, + @r" + ------- Query Output (1 rows) ------- + +----------------------+----------------------+ + | min(parquet_table.a) | max(parquet_table.b) | + +----------------------+----------------------+ + | 151 | 1199 | + +----------------------+----------------------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (ranges) path=parquet_table.parquet ranges=1064-1481,1481-1594,1594-2011,2011-2124 + " + ); +} + +#[tokio::test] +async fn query_single_parquet_file_multi_row_groups_multiple_predicates() { + let test = Test::new().with_single_file_parquet().await; + + // Note that evaluating predicates requires additional object store requests + // (to evaluate predicates) + assert_snapshot!( + test.query("select min(a), max(b) from parquet_table WHERE a > 50 AND b < 1150").await, + @r" + ------- Query Output (1 rows) ------- + +----------------------+----------------------+ + | min(parquet_table.a) | max(parquet_table.b) | + +----------------------+----------------------+ + | 51 | 1149 | + +----------------------+----------------------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - HEAD path=parquet_table.parquet + - GET (ranges) path=parquet_table.parquet ranges=4-421,421-534,534-951,951-1064 + - GET (ranges) path=parquet_table.parquet ranges=1064-1481,1481-1594,1594-2011,2011-2124 + " + ); +} + +/// Runs 
tests with a request counting object store
+struct Test {
+    object_store: Arc<RequestCountingObjectStore>,
+    session_context: SessionContext,
+    /// metadata size hint to use when registering parquet files
+    ///
+    /// * `None`: uses the default (does not set a size_hint)
+    /// * `Some(None)`: set prefetch hint to None (no prefetching)
+    /// * `Some(Some(size))`: set prefetch hint to size
+    parquet_metadata_size_hint: Option<Option<usize>>,
+}
+
+impl Test {
+    fn new() -> Self {
+        let object_store = Arc::new(RequestCountingObjectStore::new());
+        let session_context = SessionContext::new();
+        session_context
+            .runtime_env()
+            .register_object_store(&Url::parse("mem://").unwrap(), object_store.clone());
+        Self {
+            object_store,
+            session_context,
+            parquet_metadata_size_hint: None,
+        }
+    }
+
+    /// Specify the metadata size hint to use when registering parquet files
+    fn with_parquet_metadata_size_hint(mut self, size_hint: Option<usize>) -> Self {
+        self.parquet_metadata_size_hint = Some(size_hint);
+        self
+    }
+
+    /// Returns a string representation of all recorded requests thus far
+    fn requests(&self) -> String {
+        format!("{}", self.object_store)
+    }
+
+    /// Store the specified bytes at the given path
+    async fn with_bytes(self, path: &str, bytes: impl Into<Bytes>) -> Self {
+        let path = Path::from(path);
+        self.object_store
+            .inner
+            .put(&path, PutPayload::from(bytes.into()))
+            .await
+            .unwrap();
+        self
+    }
+
+    /// Register a CSV file at the given path relative to the [`datafusion_test_data`] directory
+    async fn register_csv(self, table_name: &str, path: &str) -> Self {
+        let mut options = CsvReadOptions::new();
+        options.has_header = true;
+        let url = format!("mem://{path}");
+        self.session_context
+            .register_csv(table_name, url, options)
+            .await
+            .unwrap();
+        self
+    }
+
+    /// Register a Parquet file at the given path relative to the
+    /// [`datafusion_test_data`] directory
+    async fn register_parquet(self, table_name: &str, path: &str) -> Self {
+        let path = format!("mem://{path}");
+        let mut options: ParquetReadOptions<'_> = ParquetReadOptions::new();
+
+        // If a metadata size hint was specified, apply it
+        if let Some(parquet_metadata_size_hint) = self.parquet_metadata_size_hint {
+            options = options.metadata_size_hint(parquet_metadata_size_hint);
+        }
+
+        self.session_context
+            .register_parquet(table_name, path, options)
+            .await
+            .unwrap();
+        self
+    }
+
+    /// Register a single CSV file with three columns and two rows named
+    /// `csv_table`
+    async fn with_single_file_csv(self) -> Test {
+        // upload CSV data to object store
+        let csv_data = r#"c1,c2,c3
+0.00001,5e-12,true
+0.00002,4e-12,false
+"#;
+        self.with_bytes("/csv_table.csv", csv_data)
+            .await
+            .register_csv("csv_table", "/csv_table.csv")
+            .await
+    }
+
+    /// Register three CSV files in a directory, called `csv_table`
+    async fn with_multi_file_csv(mut self) -> Test {
+        // upload CSV data to object store
+        for i in 0..3 {
+            let csv_data1 = format!(
+                r#"c1,c2,c3
+0.0000{i},{i}e-12,true
+0.00003,5e-12,false
+"#
+            );
+            self = self
+                .with_bytes(&format!("/data/file_{i}.csv"), csv_data1)
+                .await;
+        }
+        // register table
+        self.register_csv("csv_table", "/data/").await
+    }
+
+    /// Add a single parquet file that has two columns and two row groups named `parquet_table`
+    ///
+    /// Column "a": Int32 with values [0-99] in row group 1
+    /// and [100-199] in row group 2
+    ///
+    /// Column "b": Int32 with values [1000-1099] in row group 1
+    /// and [1100-1199] in row group 2
+    async fn with_single_file_parquet(self) -> Test {
+        // Create parquet bytes
+        let a: ArrayRef =
Arc::new(Int32Array::from_iter_values(0..200)); + let b: ArrayRef = Arc::new(Int32Array::from_iter_values(1000..1200)); + let batch = RecordBatch::try_from_iter([("a", a), ("b", b)]).unwrap(); + + let mut buffer = vec![]; + let props = parquet::file::properties::WriterProperties::builder() + .set_max_row_group_size(100) + .build(); + let mut writer = parquet::arrow::ArrowWriter::try_new( + &mut buffer, + batch.schema(), + Some(props), + ) + .unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + self.with_bytes("/parquet_table.parquet", buffer) + .await + .register_parquet("parquet_table", "/parquet_table.parquet") + .await + } + + /// Runs the specified query and returns a string representation of the results + /// suitable for comparison with insta snapshots + /// + /// Clears all recorded requests before running the query + async fn query(&self, sql: &str) -> String { + self.object_store.clear_requests(); + let results = self + .session_context + .sql(sql) + .await + .unwrap() + .collect() + .await + .unwrap(); + + let num_rows = results.iter().map(|batch| batch.num_rows()).sum::(); + let formatted_result = + arrow::util::pretty::pretty_format_batches(&results).unwrap(); + + let object_store = &self.object_store; + + format!( + r#"------- Query Output ({num_rows} rows) ------- +{formatted_result} +------- Object Store Request Summary ------- +{object_store} +"# + ) + } +} + +/// Details of individual requests made through the [`RequestCountingObjectStore`] +#[derive(Clone, Debug)] +enum RequestDetails { + Get { path: Path }, + GetOpts { path: Path, get_options: GetOptions }, + GetRanges { path: Path, ranges: Vec> }, + GetRange { path: Path, range: Range }, + Head { path: Path }, + List { prefix: Option }, + ListWithDelimiter { prefix: Option }, + ListWithOffset { prefix: Option, offset: Path }, +} + +fn display_range(range: &Range) -> impl Display + '_ { + struct Wrapper<'a>(&'a Range); + impl Display for Wrapper<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}-{}", self.0.start, self.0.end) + } + } + Wrapper(range) +} +impl Display for RequestDetails { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + RequestDetails::Get { path } => { + write!(f, "GET path={path}") + } + RequestDetails::GetOpts { path, get_options } => { + write!(f, "GET (opts) path={path}")?; + if let Some(range) = &get_options.range { + match range { + GetRange::Bounded(range) => { + let range = display_range(range); + write!(f, " range={range}")?; + } + GetRange::Offset(offset) => { + write!(f, " range=offset:{offset}")?; + } + GetRange::Suffix(suffix) => { + write!(f, " range=suffix:{suffix}")?; + } + } + } + if let Some(version) = &get_options.version { + write!(f, " version={version}")?; + } + if get_options.head { + write!(f, " head=true")?; + } + Ok(()) + } + RequestDetails::GetRanges { path, ranges } => { + write!(f, "GET (ranges) path={path}")?; + if !ranges.is_empty() { + write!(f, " ranges=")?; + for (i, range) in ranges.iter().enumerate() { + if i > 0 { + write!(f, ",")?; + } + write!(f, "{}", display_range(range))?; + } + } + Ok(()) + } + RequestDetails::GetRange { path, range } => { + let range = display_range(range); + write!(f, "GET (range) range={range} path={path}") + } + RequestDetails::Head { path } => { + write!(f, "HEAD path={path}") + } + RequestDetails::List { prefix } => { + write!(f, "LIST")?; + if let Some(prefix) = prefix { + write!(f, " prefix={prefix}")?; + } + Ok(()) + } + RequestDetails::ListWithDelimiter { prefix } 
=> { + write!(f, "LIST (with delimiter)")?; + if let Some(prefix) = prefix { + write!(f, " prefix={prefix}")?; + } + Ok(()) + } + RequestDetails::ListWithOffset { prefix, offset } => { + write!(f, "LIST (with offset) offset={offset}")?; + if let Some(prefix) = prefix { + write!(f, " prefix={prefix}")?; + } + Ok(()) + } + } + } +} + +#[derive(Debug)] +struct RequestCountingObjectStore { + /// Inner (memory) store + inner: Arc, + requests: Mutex>, +} + +impl Display for RequestCountingObjectStore { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "RequestCountingObjectStore()")?; + let requests = self.recorded_requests(); + write!(f, "\nTotal Requests: {}", requests.len())?; + for request in requests { + write!(f, "\n- {request}")?; + } + Ok(()) + } +} + +impl RequestCountingObjectStore { + pub fn new() -> Self { + let inner = Arc::new(InMemory::new()); + Self { + inner, + requests: Mutex::new(vec![]), + } + } + + pub fn clear_requests(&self) { + self.requests.lock().clear(); + } + + /// Return a copy of the recorded requests normalized + /// by removing the path prefix + pub fn recorded_requests(&self) -> Vec { + self.requests.lock().to_vec() + } +} + +#[async_trait] +impl ObjectStore for RequestCountingObjectStore { + async fn put_opts( + &self, + _location: &Path, + _payload: PutPayload, + _opts: PutOptions, + ) -> object_store::Result { + Err(object_store::Error::NotImplemented) + } + + async fn put_multipart_opts( + &self, + _location: &Path, + _opts: PutMultipartOptions, + ) -> object_store::Result> { + Err(object_store::Error::NotImplemented) + } + + async fn get(&self, location: &Path) -> object_store::Result { + let result = self.inner.get(location).await?; + self.requests.lock().push(RequestDetails::Get { + path: location.to_owned(), + }); + Ok(result) + } + + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> object_store::Result { + let result = self.inner.get_opts(location, options.clone()).await?; + self.requests.lock().push(RequestDetails::GetOpts { + path: location.to_owned(), + get_options: options, + }); + Ok(result) + } + + async fn get_range( + &self, + location: &Path, + range: Range, + ) -> object_store::Result { + let result = self.inner.get_range(location, range.clone()).await?; + self.requests.lock().push(RequestDetails::GetRange { + path: location.to_owned(), + range: range.clone(), + }); + Ok(result) + } + + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> object_store::Result> { + let result = self.inner.get_ranges(location, ranges).await?; + self.requests.lock().push(RequestDetails::GetRanges { + path: location.to_owned(), + ranges: ranges.to_vec(), + }); + Ok(result) + } + + async fn head(&self, location: &Path) -> object_store::Result { + let result = self.inner.head(location).await?; + self.requests.lock().push(RequestDetails::Head { + path: location.to_owned(), + }); + Ok(result) + } + + async fn delete(&self, _location: &Path) -> object_store::Result<()> { + Err(object_store::Error::NotImplemented) + } + + fn list( + &self, + prefix: Option<&Path>, + ) -> BoxStream<'static, object_store::Result> { + self.requests.lock().push(RequestDetails::List { + prefix: prefix.map(|p| p.to_owned()), + }); + + self.inner.list(prefix) + } + + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'static, object_store::Result> { + self.requests.lock().push(RequestDetails::ListWithOffset { + prefix: prefix.map(|p| p.to_owned()), + offset: offset.to_owned(), + }); + 
self.inner.list_with_offset(prefix, offset) + } + + async fn list_with_delimiter( + &self, + prefix: Option<&Path>, + ) -> object_store::Result { + self.requests + .lock() + .push(RequestDetails::ListWithDelimiter { + prefix: prefix.map(|p| p.to_owned()), + }); + self.inner.list_with_delimiter(prefix).await + } + + async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { + Err(object_store::Error::NotImplemented) + } + + async fn copy_if_not_exists( + &self, + _from: &Path, + _to: &Path, + ) -> object_store::Result<()> { + Err(object_store::Error::NotImplemented) + } +} diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs index 2abfcd8417cb..fa8ea0b31c02 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs @@ -44,7 +44,6 @@ use crate::fuzz_cases::aggregation_fuzzer::data_generator::Dataset; /// - hint `sorted` or not /// - `spilling` or not (TODO, I think a special `MemoryPool` may be needed /// to support this) -/// pub struct SessionContextGenerator { /// Current testing dataset dataset: Arc, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index 753a74995d8f..aaf2d1b9bad4 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -39,7 +39,6 @@ use crate::fuzz_cases::record_batch_generator::{ColumnDescr, RecordBatchGenerato /// will generate one `base dataset` firstly. Then the `base dataset` will be sorted /// based on each `sort_key` respectively. And finally `len(sort_keys) + 1` datasets /// will be returned -/// #[derive(Debug, Clone)] pub struct DatasetGeneratorConfig { /// Descriptions of columns in datasets, it's `required` @@ -115,7 +114,6 @@ impl DatasetGeneratorConfig { /// /// - Split each batch to multiple batches which each sub-batch in has the randomly `rows num`, /// and this multiple batches will be used to create the `Dataset`. -/// pub struct DatasetGenerator { batch_generator: RecordBatchGenerator, sort_keys_set: Vec>, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs index b90b3e5e32df..1a8ef278cc29 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs @@ -253,7 +253,6 @@ impl AggregationFuzzer { /// /// - `dataset_ref`, the input dataset, store it for error reported when found /// the inconsistency between the one for `ctx` and `expected results`. -/// struct AggregationFuzzTestTask { /// Generated session context in current test case ctx_with_params: SessionContextWithParams, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs index 209278385b7b..766e2bedd74c 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs @@ -24,7 +24,7 @@ use rand::{rng, seq::SliceRandom, Rng}; /// Creates queries like /// ```sql /// SELECT AGG(..) 
FROM table_name GROUP BY -///``` +/// ``` #[derive(Debug, Default, Clone)] pub struct QueryBuilder { // =================================== @@ -95,7 +95,6 @@ pub struct QueryBuilder { /// More details can see [`GroupOrdering`]. /// /// [`GroupOrdering`]: datafusion_physical_plan::aggregates::order::GroupOrdering - /// dataset_sort_keys: Vec>, /// If we will also test the no grouping case like: @@ -103,7 +102,6 @@ pub struct QueryBuilder { /// ```text /// SELECT aggr FROM t; /// ``` - /// no_grouping: bool, // ==================================== diff --git a/datafusion/core/tests/parquet/encryption.rs b/datafusion/core/tests/parquet/encryption.rs index 819d8bf3a283..09b93f06ce85 100644 --- a/datafusion/core/tests/parquet/encryption.rs +++ b/datafusion/core/tests/parquet/encryption.rs @@ -314,7 +314,7 @@ async fn verify_file_encrypted( for col in row_group.columns() { assert!(matches!( col.crypto_metadata(), - Some(ColumnCryptoMetaData::EncryptionWithFooterKey) + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) )); } } @@ -336,7 +336,7 @@ impl EncryptionFactory for MockEncryptionFactory { config: &EncryptionFactoryOptions, _schema: &SchemaRef, file_path: &object_store::path::Path, - ) -> datafusion_common::Result> { + ) -> datafusion_common::Result>> { assert_eq!( config.options.get("test_key"), Some(&"test value".to_string()) @@ -353,7 +353,7 @@ impl EncryptionFactory for MockEncryptionFactory { &self, config: &EncryptionFactoryOptions, file_path: &object_store::path::Path, - ) -> datafusion_common::Result> { + ) -> datafusion_common::Result>> { assert_eq!( config.options.get("test_key"), Some(&"test value".to_string()) diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index b769fec7d372..226497fe5824 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -631,8 +631,8 @@ async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> { #[tokio::test] async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> { - // Can disable the cache even with filter pushdown by setting the size to 0. In this case we - // expect the inner records are reported but no records are read from the cache + // Can disable the cache even with filter pushdown by setting the size to 0. + // This results in no records read from the cache and no metrics reported let mut config = SessionConfig::new(); config.options_mut().execution.parquet.pushdown_filters = true; config @@ -641,13 +641,10 @@ async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> { .parquet .max_predicate_cache_size = Some(0); let ctx = SessionContext::new_with_config(config); + // Since the cache is disabled, there is no reporting or use of the cache PredicateCacheTest { - // file has 8 rows, which need to be read twice, one for filter, one for - // final output - expected_inner_records: 16, - // Expect this to 0 records read as the cache is disabled. 
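The updated expectations in this hunk come from a session that keeps filter pushdown enabled while disabling the predicate cache. A small sketch of that configuration, copied in shape from `predicate_cache_pushdown_disable` above (only the helper name is new):

```rust
use datafusion::prelude::{SessionConfig, SessionContext};

fn ctx_without_predicate_cache() -> SessionContext {
    let mut config = SessionConfig::new();
    // keep pushing filters into the Parquet scan ...
    config.options_mut().execution.parquet.pushdown_filters = true;
    // ... but a cache size of 0 disables the predicate cache entirely
    config.options_mut().execution.parquet.max_predicate_cache_size = Some(0);
    SessionContext::new_with_config(config)
}
```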
However, it is - // non zero due to https://github.com/apache/arrow-rs/issues/8307 - expected_records: 3, + expected_inner_records: 0, + expected_records: 0, } .run(&ctx) .await diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index c44d14abd381..34a48cdae374 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -37,6 +37,7 @@ use datafusion::{ prelude::{ParquetReadOptions, SessionConfig, SessionContext}, }; use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder}; +use datafusion_physical_plan::metrics::MetricValue; use parquet::arrow::ArrowWriter; use parquet::file::properties::{EnabledStatistics, WriterProperties}; use std::sync::Arc; @@ -155,8 +156,30 @@ impl TestOutput { self.metric_value("row_groups_pruned_statistics") } + /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, + /// for testing purpose, here it only aggregate the `pruned` count. fn files_ranges_pruned_statistics(&self) -> Option { - self.metric_value("files_ranges_pruned_statistics") + let mut total_pruned = 0; + let mut found = false; + + for metric in self.parquet_metrics.iter() { + let metric = metric.as_ref(); + if metric.value().name() == "files_ranges_pruned_statistics" { + if let MetricValue::PruningMetrics { + pruning_metrics, .. + } = metric.value() + { + total_pruned += pruning_metrics.pruned(); + found = true; + } + } + } + + if found { + Some(total_pruned) + } else { + None + } } /// The number of row_groups matched by bloom filter or statistics diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 59cbf4b0872e..9be391a9108e 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -126,7 +126,7 @@ async fn multi_parquet_coercion_projection() { Arc::new(ParquetSource::default()), ) .with_file_group(file_group) - .with_projection(Some(vec![1, 0, 2])) + .with_projection_indices(Some(vec![1, 0, 2])) .build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 63111f43806b..db011c4be43a 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -66,9 +66,52 @@ use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{ - get_plan_string, DisplayAs, DisplayFormatType, ExecutionPlanProperties, + displayable, get_plan_string, DisplayAs, DisplayFormatType, ExecutionPlanProperties, PlanProperties, Statistics, }; +use insta::Settings; + +/// Helper function to replace only the first occurrence of a regex pattern in a plan +/// Returns (captured_group_1, modified_string) +fn hide_first( + plan: &dyn ExecutionPlan, + regex: &str, + replacement: &str, +) -> (String, String) { + let plan_str = displayable(plan).indent(true).to_string(); + let pattern = regex::Regex::new(regex).unwrap(); + + if let Some(captures) = pattern.captures(&plan_str) { + let full_match = captures.get(0).unwrap(); + let captured_value = captures + .get(1) + .map(|m| m.as_str().to_string()) + .unwrap_or_default(); + let pos = full_match.start(); + let end_pos = full_match.end(); + let 
mut result = String::with_capacity(plan_str.len()); + result.push_str(&plan_str[..pos]); + result.push_str(replacement); + result.push_str(&plan_str[end_pos..]); + (captured_value, result) + } else { + (String::new(), plan_str) + } +} + +macro_rules! assert_plan { + ($plan: expr, @ $expected:literal) => { + insta::assert_snapshot!( + displayable($plan.as_ref()).indent(true).to_string(), + @ $expected + ) + }; + ($plan: expr, $another_plan: expr) => { + let plan1 = displayable($plan.as_ref()).indent(true).to_string(); + let plan2 = displayable($another_plan.as_ref()).indent(true).to_string(); + assert_eq!(plan1, plan2); + } +} /// Models operators like BoundedWindowExec that require an input /// ordering but is easy to construct @@ -352,22 +395,6 @@ fn ensure_distribution_helper( ensure_distribution(distribution_context, &config).map(|item| item.data.plan) } -/// Test whether plan matches with expected plan -macro_rules! plans_matches_expected { - ($EXPECTED_LINES: expr, $PLAN: expr) => { - let physical_plan = $PLAN; - let actual = get_plan_string(&physical_plan); - - let expected_plan_lines: Vec<&str> = $EXPECTED_LINES - .iter().map(|s| *s).collect(); - - assert_eq!( - expected_plan_lines, actual, - "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n" - ); - } -} - fn test_suite_default_config_options() -> ConfigOptions { let mut config = ConfigOptions::new(); @@ -442,6 +469,7 @@ impl TestConfig { self } + // This be deleted in https://github.com/apache/datafusion/pull/18185 /// Perform a series of runs using the current [`TestConfig`], /// assert the expected plan result, /// and return the result plan (for potential subsequent runs). @@ -517,20 +545,79 @@ impl TestConfig { Ok(optimized) } -} -macro_rules! assert_plan_txt { - ($EXPECTED_LINES: expr, $PLAN: expr) => { - let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect(); - // Now format correctly - let actual_lines = get_plan_string(&$PLAN); + /// Perform a series of runs using the current [`TestConfig`], + /// assert the expected plan result, + /// and return the result plan (for potential subsequent runs). + fn try_to_plan( + &self, + plan: Arc, + optimizers_to_run: &[Run], + ) -> Result> { + // Add the ancillary output requirements operator at the start: + let optimizer = OutputRequirements::new_add_mode(); + let mut optimized = optimizer.optimize(plan.clone(), &self.config)?; - assert_eq!( - &expected_lines, &actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; + // This file has 2 rules that use tree node, apply these rules to original plan consecutively + // After these operations tree nodes should be in a consistent state. + // This code block makes sure that these rules doesn't violate tree node integrity. + { + let adjusted = if self.config.optimizer.top_down_join_key_reordering { + // Run adjust_input_keys_ordering rule + let plan_requirements = + PlanWithKeyRequirements::new_default(plan.clone()); + let adjusted = plan_requirements + .transform_down(adjust_input_keys_ordering) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + adjusted.plan + } else { + // Run reorder_join_keys_to_inputs rule + plan.clone() + .transform_up(|plan| { + Ok(Transformed::yes(reorder_join_keys_to_inputs(plan)?)) + }) + .data()? 
+ }; + + // Then run ensure_distribution rule + DistributionContext::new_default(adjusted) + .transform_up(|distribution_context| { + ensure_distribution(distribution_context, &self.config) + }) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + } + + for run in optimizers_to_run { + optimized = match run { + Run::Distribution => { + let optimizer = EnforceDistribution::new(); + optimizer.optimize(optimized, &self.config)? + } + Run::Sorting => { + let optimizer = EnforceSorting::new(); + optimizer.optimize(optimized, &self.config)? + } + }; + } + + // Remove the ancillary output requirements operator when done: + let optimizer = OutputRequirements::new_remove_mode(); + let optimized = optimizer.optimize(optimized, &self.config)?; + + Ok(optimized) + } + + fn to_plan( + &self, + plan: Arc, + optimizers_to_run: &[Run], + ) -> Arc { + self.try_to_plan(plan, optimizers_to_run).unwrap() + } } #[test] @@ -556,6 +643,8 @@ fn multi_hash_joins() -> Result<()> { JoinType::RightAnti, ]; + let settings = Settings::clone_current(); + // Join on (a == b1) let join_on = vec![( Arc::new(Column::new_with_schema("a", &schema()).unwrap()) as _, @@ -564,11 +653,17 @@ fn multi_hash_joins() -> Result<()> { for join_type in join_types { let join = hash_join_exec(left.clone(), right.clone(), &join_on, &join_type); - let join_plan = |shift| -> String { - format!("{}HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(a@0, b1@1)]", " ".repeat(shift)) - }; - let join_plan_indent2 = join_plan(2); - let join_plan_indent4 = join_plan(4); + + let mut settings = settings.clone(); + settings.add_filter( + // join_type={} replace with join_type=... to avoid snapshot name issue + format!("join_type={join_type}").as_str(), + "join_type=...", + ); + + insta::allow_duplicates! 
{ + settings.bind( || { + match join_type { JoinType::Inner @@ -589,50 +684,58 @@ fn multi_hash_joins() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(a@0, c@2)]"); - let expected = match join_type { + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { // Should include 3 RepartitionExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => { + + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)] + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + }, // Should include 4 RepartitionExecs - _ => vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - &join_plan_indent4, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + _ => { + 
assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + }, }; - let test_config = TestConfig::default(); - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, top_join, &SORT_DISTRIB_DISTRIB)?; + + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); } JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {} } + + match join_type { JoinType::Inner | JoinType::Left @@ -650,55 +753,64 @@ fn multi_hash_joins() -> Result<()> { let top_join = hash_join_exec(join, parquet_exec(), &top_join_on, &join_type); - let top_join_plan = match join_type { - JoinType::RightSemi | JoinType::RightAnti => - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(b1@1, c@2)]"), - _ => - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(b1@6, c@2)]"), - }; - let expected = match join_type { + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { // Should include 3 RepartitionExecs - JoinType::Inner | JoinType::Right | JoinType::RightSemi | JoinType::RightAnti => - vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + JoinType::Inner | JoinType::Right => { + assert_plan!(parquet_exec(), @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet"); + }, + // Should include 3 RepartitionExecs but have a different "on" + JoinType::RightSemi | JoinType::RightAnti => { + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@1, c@2)] + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), 
input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + + } + // Should include 4 RepartitionExecs - _ => - vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10", - &join_plan_indent4, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + _ => { + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@6, c@2)] + RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10 + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + + }, }; - let test_config = TestConfig::default(); - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, top_join, &SORT_DISTRIB_DISTRIB)?; + + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); } JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {} } + + }); + } } Ok(()) @@ -737,23 +849,27 @@ fn multi_joins_after_alias() -> Result<()> { ); // Output partition need to respect the Alias and should not introduce additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)]", - " ProjectionExec: expr=[a@0 as a1, a@0 as a2]", - " HashJoinExec: mode=Partitioned, join_type=Inner, 
on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)] + ProjectionExec: expr=[a@0 as a1, a@0 as a2] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); // Join on (a2 == c) let top_join_on = vec![( @@ -764,23 +880,27 @@ fn multi_joins_after_alias() -> Result<()> { let top_join = hash_join_exec(projection, right, &top_join_on, &JoinType::Inner); // Output partition need to respect the Alias and should not introduce additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)]", - " ProjectionExec: expr=[a@0 as a1, a@0 as a2]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), 
&DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)] + ProjectionExec: expr=[a@0 as a1, a@0 as a2] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -816,26 +936,29 @@ fn multi_joins_after_multi_alias() -> Result<()> { // The Column 'a' has different meaning now after the two Projections // The original Output partition can not satisfy the Join requirements and need to add an additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, c@2)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " ProjectionExec: expr=[c1@0 as a]", - " ProjectionExec: expr=[c@2 as c1]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, c@2)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + ProjectionExec: expr=[c1@0 as a] + ProjectionExec: expr=[c@2 as c1] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: 
partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -861,22 +984,26 @@ fn join_after_agg_alias() -> Result<()> { let join = hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner); // Only two RepartitionExecs added - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, a2@0)]", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]", - " RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a2], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, a2@0)] + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a2], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -914,23 +1041,27 @@ fn hash_join_key_ordering() -> Result<()> { let join = hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner); // Only two RepartitionExecs added - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b1@1, b@0), (a1@0, a@1)]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " 
DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b1@1, b@0), (a1@0, a@1)] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1034,30 +1165,35 @@ fn multi_hash_join_key_ordering() -> Result<()> { Arc::new(FilterExec::try_new(predicate, top_join)?); // The bottom joins' join key ordering is adjusted based on the top join. And the top join should not introduce additional RepartitionExec - let expected = &[ - "FilterExec: c@6 > 1", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)]", - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, filter_top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, filter_top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = + test_config.to_plan(filter_top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + 
@r" + FilterExec: c@6 > 1 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(filter_top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1168,34 +1304,34 @@ fn reorder_join_keys_to_left_input() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={:?}, on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)]", &join_type); - let reordered = reorder_join_keys_to_inputs(top_join)?; + let reordered = reorder_join_keys_to_inputs(top_join).unwrap(); // The top joins' join key ordering is adjusted based on the children inputs. 
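// Note: `hide_first` is used below but defined outside this hunk. From its call sites it
// appears to render the plan, capture the first match of the given regex (here the concrete
// join type) and substitute the placeholder, so one inline snapshot can be shared across all
// join types in the loop. A minimal sketch of such a helper, assuming `displayable` /
// `ExecutionPlan` from DataFusion's physical-plan API and the `regex` crate, could be:
//
//     fn hide_first(
//         plan: &dyn ExecutionPlan,
//         pattern: &str,
//         replacement: &str,
//     ) -> (String, String) {
//         // Render the plan in the same indented form the snapshots use.
//         let rendered = displayable(plan).indent(true).to_string();
//         let re = regex::Regex::new(pattern).unwrap();
//         // Capture group 1 of the first match (e.g. the join type) for the caller to check.
//         let captured = re
//             .captures(&rendered)
//             .and_then(|c| c.get(1))
//             .map(|m| m.as_str().to_string())
//             .unwrap_or_default();
//         // Replace only the first match, so nested joins keep their literal join_type.
//         let modified = re.replace(&rendered, replacement).to_string();
//         (captured, modified)
//     }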
- let expected = &[ - top_join_plan.as_str(), - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)]", - " RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - - assert_plan_txt!(expected, reordered); + let (captured_join_type, modified_plan) = + hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=..."); + assert_eq!(captured_join_type, join_type.to_string()); + + insta::allow_duplicates! {insta::assert_snapshot!(modified_plan, @r" +HashJoinExec: mode=Partitioned, join_type=..., on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)] + RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +");} } Ok(()) @@ -1302,34 +1438,32 @@ fn reorder_join_keys_to_right_input() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={:?}, on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)]", &join_type); - let reordered = reorder_join_keys_to_inputs(top_join)?; + let reordered = 
reorder_join_keys_to_inputs(top_join).unwrap(); // The top joins' join key ordering is adjusted based on the children inputs. - let expected = &[ - top_join_plan.as_str(), - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)]", - " RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - - assert_plan_txt!(expected, reordered); + let (_, plan_str) = + hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=..."); + insta::allow_duplicates! {insta::assert_snapshot!(plan_str, @r" +HashJoinExec: mode=Partitioned, join_type=..., on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)] + RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +");} } Ok(()) @@ -1670,52 +1804,52 @@ fn smj_join_key_ordering() -> Result<()> { // Test: run EnforceDistribution, then EnforceSort. 
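// Note: the `assert_plan!` macro used throughout these tests is defined outside this hunk.
// It is invoked in two forms: `assert_plan!(plan, @r"...")`, an inline insta snapshot of the
// rendered plan, and `assert_plan!(plan_a, plan_b)`, asserting that both optimizer orderings
// produce the same physical plan. One plausible shape, assuming plans are compared via their
// indented display strings (the real macro may differ):
//
//     macro_rules! assert_plan {
//         // Inline snapshot form: assert_plan!(plan, @r"...").
//         ($plan:expr, @ $snapshot:tt) => {
//             insta::assert_snapshot!(
//                 displayable($plan.as_ref()).indent(true).to_string(),
//                 @ $snapshot
//             )
//         };
//         // Two-plan form: both plans must render identically.
//         ($left:expr, $right:expr) => {
//             assert_eq!(
//                 displayable($left.as_ref()).indent(true).to_string(),
//                 displayable($right.as_ref()).indent(true).to_string()
//             )
//         };
//     }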
// Only two RepartitionExecs added - let expected = &[ - "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]", - " SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a@1 as a2, b@0 as b2]", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] + SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@1 as a2, b@0 as b2] + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]", - " RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b3@1 ASC, a3@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: 
partitioning=Hash([b2@1, a2@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b2@1 ASC, a2@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a@1 as a2, b@0 as b2]", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, join, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, @r" +SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] + RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b3@1 ASC, a3@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b2@1 ASC, a2@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@1 as a2, b@0 as b2] + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -1744,13 +1878,14 @@ fn merge_does_not_need_sort() -> Result<()> { // // The optimizer should not add an additional SortExec as the // data is already sorted - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " CoalesceBatchesExec: target_batch_size=4096", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, exec.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(exec.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + CoalesceBatchesExec: target_batch_size=4096 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: // @@ -1758,13 +1893,14 @@ fn merge_does_not_need_sort() -> Result<()> { // (according to flag: PREFER_EXISTING_SORT) // hence in this case ordering 
lost during CoalescePartitionsExec and re-introduced with // SortExec at the top. - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " CoalesceBatchesExec: target_batch_size=4096", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, exec, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(exec, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + CoalesceBatchesExec: target_batch_size=4096 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -1790,25 +1926,26 @@ fn union_to_interleave() -> Result<()> { aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]); // Only two RepartitionExecs added, no final RepartitionExec required - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]", - " AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]", - " InterleaveExec", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[] + InterleaveExec + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1834,28 +1971,29 @@ fn union_not_to_interleave() -> Result<()> { aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]); // Only two RepartitionExecs added, no final RepartitionExec required - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], 
aggr=[]", - " RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20", - " AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]", - " UnionExec", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - // TestConfig: Prefer existing union. let test_config = TestConfig::default().with_prefer_existing_union(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20 + AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[] + UnionExec + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1865,17 +2003,18 @@ fn added_repartition_to_single_partition() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(parquet_exec(), alias); - let expected = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(&expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + 
"); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1885,18 +2024,19 @@ fn repartition_deepest_node() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(filter_exec(parquet_exec()), alias); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1905,19 +2045,20 @@ fn repartition_deepest_node() -> Result<()> { fn repartition_unsorted_limit() -> Result<()> { let plan = limit_exec(filter_exec(parquet_exec())); - let expected = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // nothing sorts the data, so the local limit doesn't require sorted data either - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + // nothing sorts the data, so the local limit doesn't require sorted data either + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1932,17 +2073,18 @@ fn repartition_sorted_limit() -> Result<()> { .into(); let plan = limit_exec(sort_exec(sort_key, parquet_exec())); - let expected = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), 
&DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // data is sorted so can't repartition here + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1960,19 +2102,20 @@ fn repartition_sorted_limit_with_filter() -> Result<()> { sort_key, ); - let expected = &[ - "SortRequiredExec: [c@2 ASC]", - " FilterExec: c@2 = 0", - // We can use repartition here, ordering requirement by SortRequiredExec - // is still satisfied. - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [c@2 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // We can use repartition here, ordering requirement by SortRequiredExec + // is still satisfied. + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1985,26 +2128,28 @@ fn repartition_ignores_limit() -> Result<()> { alias, ); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // Expect no repartition to happen for local limit - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // repartition should happen prior to 
the filter to maximize parallelism + // Expect no repartition to happen for local limit (DataSourceExec) + + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2013,19 +2158,20 @@ fn repartition_ignores_limit() -> Result<()> { fn repartition_ignores_union() -> Result<()> { let plan = union_exec(vec![parquet_exec(); 5]); - let expected = &[ - "UnionExec", - // Expect no repartition of DataSourceExec - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Expect no repartition of DataSourceExec + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2041,15 +2187,15 @@ fn repartition_through_sort_preserving_merge() -> Result<()> { .into(); let plan = sort_preserving_merge_exec(sort_key, parquet_exec()); - // need resort as the data was not sorted correctly - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2068,24 +2214,25 @@ fn repartition_ignores_sort_preserving_merge() -> Result<()> { parquet_exec_multiple_sorted(vec![sort_key]), ); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort - // + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [c@2 ASC] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // should not sort (as the data was already sorted) // should not repartition, since increased parallelism is not beneficial for SortPReservingMerge - let expected = &[ - 
"SortPreservingMergeExec: [c@2 ASC]", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -2105,27 +2252,29 @@ fn repartition_ignores_sort_preserving_merge_with_union() -> Result<()> { ]); let plan = sort_preserving_merge_exec(sort_key, input); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // // should not repartition / sort (as the data was already sorted) - let expected = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; // test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -2149,16 +2298,17 @@ fn repartition_does_not_destroy_sort() -> Result<()> { // TestConfig: Prefer existing sort. 
let test_config = TestConfig::default().with_prefer_existing_sort(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [d@3 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet +"); // during repartitioning ordering is preserved - let expected = &[ - "SortRequiredExec: [d@3 ASC]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet", - ]; - - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2187,22 +2337,25 @@ fn repartition_does_not_destroy_sort_more_complex() -> Result<()> { let input2 = filter_exec(parquet_exec()); let plan = union_exec(vec![input1, input2]); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +UnionExec + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // union input 1: no repartitioning + // union input 2: should repartition + // // should not repartition below the SortRequired as that // branch doesn't benefit from increased parallelism - let expected = &[ - "UnionExec", - // union input 1: no repartitioning - " SortRequiredExec: [c@2 ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - // union input 2: should repartition - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2227,28 +2380,28 @@ fn repartition_transitively_with_projection() -> Result<()> { .into(); let plan = sort_preserving_merge_exec(sort_key, proj); - // Test: run EnforceDistribution, then EnforceSort. 
- let expected = &[ - "SortPreservingMergeExec: [sum@0 ASC]", - " SortExec: expr=[sum@0 ASC], preserve_partitioning=[true]", - // Since this projection is not trivial, increasing parallelism is beneficial - " ProjectionExec: expr=[a@0 + b@1 as sum]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [sum@0 ASC] + SortExec: expr=[sum@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@0 + b@1 as sum] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[sum@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - // Since this projection is not trivial, increasing parallelism is beneficial - " ProjectionExec: expr=[a@0 + b@1 as sum]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[sum@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@0 + b@1 as sum] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Since this projection is not trivial, increasing parallelism is beneficial Ok(()) } @@ -2275,16 +2428,18 @@ fn repartition_ignores_transitively_with_projection() -> Result<()> { sort_key, ); - let expected = &[ - "SortRequiredExec: [c@2 ASC]", - // Since this projection is trivial, increasing parallelism is not beneficial - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [c@2 ASC] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + // Since this projection is trivial, increasing parallelism is not beneficial + + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2310,16 +2465,17 @@ fn repartition_transitively_past_sort_with_projection() -> Result<()> { ), ); - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Since this projection is trivial, increasing parallelism is not beneficial - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, 
c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Since this projection is trivial, increasing parallelism is not beneficial + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2334,28 +2490,30 @@ fn repartition_transitively_past_sort_with_filter() -> Result<()> { .into(); let plan = sort_exec(sort_key, filter_exec(parquet_exec())); - // Test: run EnforceDistribution, then EnforceSort. - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + + // Expect repartition on the input to the sort (as it can benefit from additional parallelism) // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - // Expect repartition on the input of the filter (as it can benefit from additional parallelism) - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Expect repartition on the input of the filter (as it can benefit from additional parallelism) Ok(()) } @@ -2381,30 +2539,32 @@ fn repartition_transitively_past_sort_with_projection_and_filter() -> Result<()> ), ); - // Test: run EnforceDistribution, then EnforceSort. 
- let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " FilterExec: c@2 = 0", - // repartition is lowest down - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + + // Expect repartition on the input to the sort (as it can benefit from additional parallelism) + // repartition is lowest down // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -2420,28 +2580,29 @@ fn parallelization_single_partition() -> Result<()> { .with_query_execution_partitions(2); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: 
partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2465,40 +2626,31 @@ fn parallelization_multiple_files() -> Result<()> { // The groups must have only contiguous ranges of rows from the same file // if any group has rows from multiple files, the data is no longer sorted destroyed // https://github.com/apache/datafusion/issues/8451 - let expected_with_3_target_partitions = [ - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config_concurrency_3 = test_config.clone().with_query_execution_partitions(3); - test_config_concurrency_3.run( - &expected_with_3_target_partitions, - plan.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config_concurrency_3.run( - &expected_with_3_target_partitions, - plan.clone(), - &SORT_DISTRIB_DISTRIB, - )?; + let plan_3_distrib = + test_config_concurrency_3.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_3_distrib, + @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); + let plan_3_sort = + test_config_concurrency_3.to_plan(plan.clone(), &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_3_distrib, plan_3_sort); - let expected_with_8_target_partitions = [ - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config_concurrency_8 = test_config.with_query_execution_partitions(8); - test_config_concurrency_8.run( - &expected_with_8_target_partitions, - plan.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config_concurrency_8.run( - &expected_with_8_target_partitions, - plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_8_distrib = + test_config_concurrency_8.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_8_distrib, + @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); + let plan_8_sort = test_config_concurrency_8.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_8_distrib, plan_8_sort); Ok(()) } @@ -2570,30 +2722,30 @@ fn 
parallelization_two_partitions() -> Result<()> { .with_prefer_repartition_file_scans(10); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Plan already has two partitions - " DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Plan already has two partitions + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Plan already has two partitions - " DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Plan already has two partitions + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2609,30 +2761,32 @@ fn parallelization_two_partitions_into_four() -> Result<()> { .with_prefer_repartition_file_scans(10); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Multiple source files split across partitions - " DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + // Multiple source files split across partitions + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, 
gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Multiple source files split across partitions + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Multiple source files split across partitions - " DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + // Multiple source files split across partitions + assert_plan!(plan_csv_distrib, @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Multiple source files split across partitions + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2651,32 +2805,32 @@ fn parallelization_sorted_limit() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Doesn't parallelize for SortExec without preserve_partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // data is sorted so can't repartition here + // Doesn't parallelize for SortExec without preserve_partitioning + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Doesn't parallelize for SortExec without preserve_partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = 
test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // data is sorted so can't repartition here + // Doesn't parallelize for SortExec without preserve_partitioning + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2696,40 +2850,41 @@ fn parallelization_limit_with_filter() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // even though data is sorted, we can use repartition here. Since - // ordering is not used in subsequent stages anyway. - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // SortExec doesn't benefit from input partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + // even though data is sorted, we can use repartition here. Since + // ordering is not used in subsequent stages anyway. + // SortExec doesn't benefit from input partitioning + assert_plan!(plan_parquet_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // even though data is sorted, we can use repartition here. Since - // ordering is not used in subsequent stages anyway. - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // SortExec doesn't benefit from input partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + // even though data is sorted, we can use repartition here. Since + // ordering is not used in subsequent stages anyway. 
+ // SortExec doesn't benefit from input partitioning + assert_plan!(plan_csv_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2747,48 +2902,49 @@ fn parallelization_ignores_limit() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - // Limit doesn't benefit from input partitioning - no parallelism - " LocalLimitExec: fetch=100", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + // repartition should happen prior to the filter to maximize parallelism + // Limit doesn't benefit from input partitioning - no parallelism + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - // Limit doesn't benefit from input partitioning - no parallelism - " LocalLimitExec: fetch=100", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, 
d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); + // repartition should happen prior to the filter to maximize parallelism + // Limit doesn't benefit from input partitioning - no parallelism + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2801,34 +2957,35 @@ fn parallelization_union_inputs() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "UnionExec", - // Union doesn't benefit from input partitioning - no parallelism - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Union doesn't benefit from input partitioning - no parallelism + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "UnionExec", - // Union doesn't benefit from input partitioning - no parallelism - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Union doesn't benefit from input partitioning - no parallelism + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2855,22 +3012,21 @@ fn parallelization_prior_to_sort_preserving_merge() -> Result<()> { // parallelization is not beneficial for SortPreservingMerge // Test: with parquet - let expected_parquet = &[ - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet" + ); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false" + ); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2900,54 +3056,47 @@ fn parallelization_sort_preserving_merge_with_union() -> Result<()> { // should not sort (as the data was already sorted) // Test: with parquet - let expected_parquet = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - let expected_parquet_first_sort_enforcement = &[ - // no SPM - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // has coalesce - " CoalescePartitionsExec", - " UnionExec", - " 
DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet_first_sort_enforcement, - plan_parquet, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_sort, + @r" + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + // no SPM + // has coalesce // Test: with csv - let expected_csv = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - let expected_csv_first_sort_enforcement = &[ - // no SPM - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // has coalesce - " CoalescePartitionsExec", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run( - expected_csv_first_sort_enforcement, - plan_csv.clone(), - &SORT_DISTRIB_DISTRIB, - )?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + let plan_csv_sort = test_config.to_plan(plan_csv.clone(), &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_sort, + @r" + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + // no SPM + // has coalesce Ok(()) } @@ -2975,24 +3124,25 @@ fn parallelization_does_not_benefit() -> Result<()> { // no parallelization, because SortRequiredExec doesn't benefit from increased parallelism // Test: with parquet - let expected_parquet = &[ - "SortRequiredExec: [c@2 
ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "SortRequiredExec: [c@2 ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -3023,26 +3173,26 @@ fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> .into(); let plan_parquet = sort_preserving_merge_exec(sort_key_after_projection, proj_parquet); - let expected = &[ - "SortPreservingMergeExec: [c2@1 ASC]", - " ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - plans_matches_expected!(expected, &plan_parquet); + assert_plan!(plan_parquet, + @r" + SortPreservingMergeExec: [c2@1 ASC] + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + + let test_config = TestConfig::default(); + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); // Expected Outcome: // data should not be repartitioned / resorted - let expected_parquet = &[ - "ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + assert_plan!(plan_parquet_distrib, + @r" +ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); Ok(()) } @@ -3071,22 +3221,24 @@ fn parallelization_ignores_transitively_with_projection_csv() -> Result<()> { }] .into(); let plan_csv = sort_preserving_merge_exec(sort_key_after_projection, proj_csv); - let expected = &[ - "SortPreservingMergeExec: [c2@1 ASC]", - " ProjectionExec: expr=[a@0 
as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - plans_matches_expected!(expected, &plan_csv); + assert_plan!(plan_csv, + @r" +SortPreservingMergeExec: [c2@1 ASC] + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false +"); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false +"); // Expected Outcome: // data should not be repartitioned / resorted - let expected_csv = &[ - "ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - let test_config = TestConfig::default(); - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3096,24 +3248,25 @@ fn remove_redundant_roundrobins() -> Result<()> { let input = parquet_exec(); let repartition = repartition_exec(repartition_exec(input)); let physical_plan = repartition_exec(filter_exec(repartition)); - let expected = &[ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, &physical_plan); - - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; + assert_plan!(physical_plan, + @r" +RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3133,18 +3286,19 @@ fn remove_unnecessary_spm_after_filter() -> Result<()> { // TestConfig: Prefer existing sort. 
let test_config = TestConfig::default().with_prefer_existing_sort(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Expected Outcome: // Original plan expects its output to be ordered by c@2 ASC. // This is still satisfied since, after filter that column is constant. - let expected = &[ - "CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + assert_plan!(plan_distrib, + @r" +CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3164,14 +3318,16 @@ fn preserve_ordering_through_repartition() -> Result<()> { // TestConfig: Prefer existing sort. let test_config = TestConfig::default().with_prefer_existing_sort(); - let expected = &[ - "SortPreservingMergeExec: [d@3 ASC]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [d@3 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3189,29 +3345,27 @@ fn do_not_preserve_ordering_through_repartition() -> Result<()> { let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. 
- let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run( - expected_first_sort_enforcement, - physical_plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -3227,17 +3381,18 @@ fn no_need_for_sort_after_filter() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input)); - let expected = &[ - // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied. - "CoalescePartitionsExec", - // Since after this stage c is constant. c@2 ASC ordering is already satisfied. - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, @r" +CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); + // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied. + // Since after this stage c is constant. c@2 ASC ordering is already satisfied. 
Ok(()) } @@ -3261,30 +3416,28 @@ fn do_not_preserve_ordering_through_repartition2() -> Result<()> { let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_first_sort_enforcement, - physical_plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -3300,14 +3453,16 @@ fn do_not_preserve_ordering_through_repartition3() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key]); let physical_plan = filter_exec(input); - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3322,30 +3477,27 @@ fn do_not_put_sort_when_input_is_invalid() -> Result<()> { .into(); let input = parquet_exec(); let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key); - let expected = &[ - // Ordering requirement of sort required exec is NOT satisfied - // by 
existing ordering at the source. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - assert_plan_txt!(expected, physical_plan); - - let expected = &[ - "SortRequiredExec: [a@0 ASC]", - // Since at the start of the rule ordering requirement is not satisfied - // EnforceDistribution rule doesn't satisfy this requirement either. - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; + // Ordering requirement of sort required exec is NOT satisfied + // by existing ordering at the source. + assert_plan!(physical_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; config.optimizer.prefer_existing_sort = false; let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; - assert_plan_txt!(expected, dist_plan); + // Since at the start of the rule ordering requirement is not satisfied + // EnforceDistribution rule doesn't satisfy this requirement either. + assert_plan!(dist_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -3361,29 +3513,26 @@ fn put_sort_when_input_is_valid() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key); - let expected = &[ - // Ordering requirement of sort required exec is satisfied - // by existing ordering at the source. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - assert_plan_txt!(expected, physical_plan); - - let expected = &[ - // Since at the start of the rule ordering requirement is satisfied - // EnforceDistribution rule satisfy this requirement also. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; + // Ordering requirement of sort required exec is satisfied + // by existing ordering at the source. + assert_plan!(physical_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; config.optimizer.prefer_existing_sort = false; let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; - assert_plan_txt!(expected, dist_plan); + // Since at the start of the rule ordering requirement is satisfied + // EnforceDistribution rule satisfy this requirement also. 
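The two `sort_required_exec_with_req` tests around this point bypass `TestConfig` and apply `EnforceDistribution` on its own against a hand-built `ConfigOptions` (10 target partitions, round-robin repartitioning enabled, `prefer_existing_sort` disabled). A compact sketch of that single-rule invocation, with the `use` paths given only as assumptions since they vary between DataFusion versions:

```rust
// Sketch only: how these tests drive a single optimizer rule. The import
// paths below are assumptions; the config field names and the `optimize`
// call mirror the test code shown above.
use std::sync::Arc;

use datafusion::config::ConfigOptions;
use datafusion::error::Result;
use datafusion::physical_optimizer::enforce_distribution::EnforceDistribution;
use datafusion::physical_optimizer::PhysicalOptimizerRule;
use datafusion::physical_plan::ExecutionPlan;

/// Run only EnforceDistribution, with the knobs these tests set explicitly.
fn enforce_distribution_only(
    plan: Arc<dyn ExecutionPlan>,
) -> Result<Arc<dyn ExecutionPlan>> {
    let mut config = ConfigOptions::new();
    config.execution.target_partitions = 10;
    config.optimizer.enable_round_robin_repartition = true;
    config.optimizer.prefer_existing_sort = false;
    // PhysicalOptimizerRule::optimize rewrites the plan for distribution
    // requirements only; the tests then snapshot the result directly.
    EnforceDistribution::new().optimize(plan, &config)
}
```

Snapshotting the plan both before and after this call, as the hunks here do, makes it explicit whether the rule adds a repartition or leaves the ordering-related operators untouched when the requirement is already decided at the source.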
+ assert_plan!(dist_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -3404,13 +3553,15 @@ fn do_not_add_unnecessary_hash() -> Result<()> { // Make sure target partition number is 1. In this case hash repartition is unnecessary. let test_config = TestConfig::default().with_query_execution_partitions(1); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3432,19 +3583,21 @@ fn do_not_add_unnecessary_hash2() -> Result<()> { // Make sure target partition number is larger than 2 (e.g partition number at the source). let test_config = TestConfig::default().with_query_execution_partitions(4); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - // Since hash requirements of this operator is satisfied. There shouldn't be - // a hash repartition here - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + // Since hash requirements of this operator is satisfied. 
There shouldn't be + // a hash repartition here + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3452,19 +3605,21 @@ fn do_not_add_unnecessary_hash2() -> Result<()> { #[test] fn optimize_away_unnecessary_repartition() -> Result<()> { let physical_plan = coalesce_partitions_exec(repartition_exec(parquet_exec())); - let expected = &[ - "CoalescePartitionsExec", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, physical_plan.clone()); - - let expected = - &["DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet"]; + assert_plan!(physical_plan, + @r" +CoalescePartitionsExec + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3474,25 +3629,27 @@ fn optimize_away_unnecessary_repartition2() -> Result<()> { let physical_plan = filter_exec(repartition_exec(coalesce_partitions_exec( filter_exec(repartition_exec(parquet_exec())), ))); - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, physical_plan.clone()); + assert_plan!(physical_plan, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); - let expected = &[ - "FilterExec: c@2 = 0", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3512,27 +3669,31 @@ async fn test_distribute_sort_parquet() -> Result<()> { let physical_plan = 
sort_exec(sort_key, parquet_exec_with_stats(10000 * 8192)); // prior to optimization, this is the starting plan - let starting = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(starting, physical_plan.clone()); + assert_plan!(physical_plan, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // what the enforce distribution run does. - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &[Run::Distribution])?; + let plan_distribution = + test_config.to_plan(physical_plan.clone(), &[Run::Distribution]); + assert_plan!(plan_distribution, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet +"); // what the sort parallelization (in enforce sorting), does after the enforce distribution changes - let expected = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, physical_plan, &[Run::Distribution, Run::Sorting])?; + let plan_both = + test_config.to_plan(physical_plan, &[Run::Distribution, Run::Sorting]); + assert_plan!(plan_both, + @r" +SortPreservingMergeExec: [c@2 ASC] + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -3557,12 +3718,12 @@ async fn test_distribute_sort_memtable() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; // this is the final, optimized plan - let expected = &[ - "SortPreservingMergeExec: [id@0 ASC NULLS LAST]", - " SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]", - " DataSourceExec: partitions=3, partition_sizes=[34, 33, 33]", - ]; - plans_matches_expected!(expected, physical_plan); + assert_plan!(physical_plan, + @r" +SortPreservingMergeExec: [id@0 ASC NULLS LAST] + SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] + DataSourceExec: partitions=3, partition_sizes=[34, 33, 33] +"); Ok(()) } diff --git 
a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index ad77a453350f..620259821871 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -667,12 +667,12 @@ async fn test_soft_hard_requirements_remove_soft_requirement() -> Result<()> { let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -716,13 +716,13 @@ async fn test_soft_hard_requirements_remove_soft_requirement_without_pushdowns( assert_snapshot!(test.run(), @r#" Input Plan: ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -763,13 +763,13 @@ async fn test_soft_hard_requirements_remove_soft_requirement_without_pushdowns( let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED 
PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -824,15 +824,15 @@ async fn test_soft_hard_requirements_multiple_soft_requirements() -> Result<()> let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -889,17 +889,17 @@ async fn test_soft_hard_requirements_multiple_soft_requirements() -> Result<()> let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { 
"count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -961,14 +961,14 @@ async fn test_soft_hard_requirements_multiple_sorts() -> Result<()> { Input Plan: SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -1023,16 +1023,16 @@ async fn test_soft_hard_requirements_with_multiple_soft_requirements_and_output_ assert_snapshot!(test.run(), @r#" Input Plan: 
OutputRequirementExec: order_by=[(non_nullable_col@1, asc)], dist_by=SinglePartition - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: OutputRequirementExec: order_by=[(non_nullable_col@1, asc)], dist_by=SinglePartition - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -1081,7 +1081,7 @@ async fn test_window_multi_path_sort() -> Result<()> { let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 DESC NULLS LAST] UnionExec SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -1090,7 +1090,7 @@ async fn test_window_multi_path_sort() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] SortPreservingMergeExec: [nullable_col@0 ASC] UnionExec DataSourceExec: file_groups={1 group: 
[[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet @@ -1122,7 +1122,7 @@ async fn test_window_multi_path_sort2() -> Result<()> { let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC] UnionExec SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] @@ -1131,7 +1131,7 @@ async fn test_window_multi_path_sort2() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 ASC] UnionExec DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet @@ -1678,7 +1678,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC, b@1 ASC RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 @@ -1686,7 +1686,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 @@ -1783,18 +1783,18 @@ async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] FilterExec: NOT non_nullable_col@1 SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] CoalesceBatchesExec: target_batch_size=128 SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] FilterExec: NOT non_nullable_col@1 - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] CoalesceBatchesExec: target_batch_size=128 SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] @@ -2238,17 +2238,17 @@ async fn test_multiple_sort_window_exec() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: 
"count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] "#); @@ -2273,7 +2273,7 @@ async fn test_commutativity() -> Result<()> { assert_snapshot!(displayable(orig_plan.as_ref()).indent(true), @r#" SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, partition_sizes=[0] "#); @@ -2483,7 +2483,6 @@ async fn test_not_replaced_with_partial_sort_for_unbounded_input() -> Result<()> Ok(()) } -// Test that verifies that an orthogonal sort (a sort on columns not in the input ordering) #[test] fn test_removes_unused_orthogonal_sort() -> Result<()> { let schema = create_test_schema3()?; diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs index 7d6c0484b624..ef233e222912 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs @@ -229,11 +229,11 @@ fn test_window_partial_constant_and_set_monotonicity_0() { @ r#" Input Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), 
frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -253,11 +253,11 @@ fn test_window_partial_constant_and_set_monotonicity_1() { @ r#" Input Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -275,15 +275,15 @@ fn test_window_partial_constant_and_set_monotonicity_2() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), 
end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -299,15 +299,15 @@ fn test_window_partial_constant_and_set_monotonicity_3() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -323,16 +323,16 @@ fn test_window_partial_constant_and_set_monotonicity_4() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, 
dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -348,16 +348,16 @@ fn test_window_partial_constant_and_set_monotonicity_5() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -373,16 +373,16 @@ fn test_window_partial_constant_and_set_monotonicity_6() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -398,16 +398,16 @@ fn test_window_partial_constant_and_set_monotonicity_7() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), 
is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -427,15 +427,15 @@ fn test_window_partial_constant_and_set_monotonicity_8() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -451,15 +451,15 @@ fn test_window_partial_constant_and_set_monotonicity_9() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: 
Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -477,7 +477,7 @@ fn test_window_partial_constant_and_set_monotonicity_10() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -497,7 +497,7 @@ fn test_window_partial_constant_and_set_monotonicity_11() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -522,7 +522,7 @@ fn test_window_partial_constant_and_set_monotonicity_12() { @ r#" Input / Optimized Plan: SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -543,7 +543,7 @@ fn test_window_partial_constant_and_set_monotonicity_13() { @ r#" Input / Optimized Plan: SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: 
WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -564,7 +564,7 @@ fn test_window_partial_constant_and_set_monotonicity_14() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -585,7 +585,7 @@ fn test_window_partial_constant_and_set_monotonicity_15() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -608,15 +608,15 @@ fn test_window_partial_constant_and_set_monotonicity_16() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, 
end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -633,15 +633,15 @@ fn test_window_partial_constant_and_set_monotonicity_17() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -658,15 +658,15 @@ fn test_window_partial_constant_and_set_monotonicity_18() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -685,7 +685,7 @@ fn test_window_partial_constant_and_set_monotonicity_19() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -710,7 +710,7 @@ fn test_window_partial_constant_and_set_monotonicity_20() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -729,15 +729,15 @@ fn test_window_partial_constant_and_set_monotonicity_21() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, 
non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -756,7 +756,7 @@ fn test_window_partial_constant_and_set_monotonicity_22() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -777,7 +777,7 @@ fn test_window_partial_constant_and_set_monotonicity_23() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -800,15 +800,15 @@ fn test_window_partial_constant_and_set_monotonicity_24() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 
group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -827,7 +827,7 @@ fn test_window_partial_constant_and_set_monotonicity_25() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -847,7 +847,7 @@ fn test_window_partial_constant_and_set_monotonicity_26() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -867,7 +867,7 @@ fn test_window_partial_constant_and_set_monotonicity_27() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -893,7 +893,7 @@ fn test_window_partial_constant_and_set_monotonicity_28() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 DESC NULLS LAST, nullable_col@0 ASC 
NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -912,15 +912,15 @@ fn test_window_partial_constant_and_set_monotonicity_29() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"#) + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "#) } // Case 30: @@ -937,7 +937,7 @@ fn test_window_partial_constant_and_set_monotonicity_30() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -957,7 +957,7 @@ fn test_window_partial_constant_and_set_monotonicity_31() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS 
LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -981,15 +981,15 @@ fn test_window_partial_constant_and_set_monotonicity_32() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1008,7 +1008,7 @@ fn test_window_partial_constant_and_set_monotonicity_33() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1027,15 +1027,15 @@ fn test_window_partial_constant_and_set_monotonicity_34() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - 
DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } // Case 35: @@ -1053,7 +1053,7 @@ fn test_window_partial_constant_and_set_monotonicity_35() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1077,15 +1077,15 @@ fn test_window_partial_constant_and_set_monotonicity_36() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT 
ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1102,15 +1102,15 @@ fn test_window_partial_constant_and_set_monotonicity_37() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1129,7 +1129,7 @@ fn test_window_partial_constant_and_set_monotonicity_38() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1149,7 +1149,7 @@ fn test_window_partial_constant_and_set_monotonicity_39() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1173,15 +1173,15 @@ fn test_window_partial_constant_and_set_monotonicity_40() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - 
BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1200,7 +1200,7 @@ fn test_window_partial_constant_and_set_monotonicity_41() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1221,7 +1221,7 @@ fn test_window_partial_constant_and_set_monotonicity_42() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1242,7 +1242,7 @@ fn test_window_partial_constant_and_set_monotonicity_43() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1267,7 +1267,7 @@ fn test_window_partial_constant_and_set_monotonicity_44() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1288,7 +1288,7 @@ fn test_window_partial_constant_and_set_monotonicity_45() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1307,15 +1307,15 @@ fn test_window_partial_constant_and_set_monotonicity_46() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1331,15 +1331,15 @@ fn test_window_partial_constant_and_set_monotonicity_47() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: 
Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1361,15 +1361,15 @@ fn test_window_partial_constant_and_set_monotonicity_48() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1387,7 +1387,7 @@ fn test_window_partial_constant_and_set_monotonicity_49() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { 
"max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1406,15 +1406,15 @@ fn test_window_partial_constant_and_set_monotonicity_50() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1432,7 +1432,7 @@ fn test_window_partial_constant_and_set_monotonicity_51() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1458,7 +1458,7 @@ fn test_window_partial_constant_and_set_monotonicity_52() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1479,7 +1479,7 @@ fn test_window_partial_constant_and_set_monotonicity_53() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], 
preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1499,7 +1499,7 @@ fn test_window_partial_constant_and_set_monotonicity_54() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1517,15 +1517,15 @@ fn test_window_partial_constant_and_set_monotonicity_55() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1547,15 +1547,15 @@ fn test_window_partial_constant_and_set_monotonicity_56() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], 
preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1574,7 +1574,7 @@ fn test_window_partial_constant_and_set_monotonicity_57() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1595,7 +1595,7 @@ fn test_window_partial_constant_and_set_monotonicity_58() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1615,7 +1615,7 @@ fn test_window_partial_constant_and_set_monotonicity_59() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1641,7 +1641,7 @@ fn test_window_partial_constant_and_set_monotonicity_60() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] 
+ BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1662,7 +1662,7 @@ fn test_window_partial_constant_and_set_monotonicity_61() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1683,7 +1683,7 @@ fn test_window_partial_constant_and_set_monotonicity_62() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1701,15 +1701,15 @@ fn test_window_partial_constant_and_set_monotonicity_63() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } // =============================================REGION ENDS============================================= diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs 
b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index f05f3f00281d..7d8a9c7c2125 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -24,6 +24,7 @@ use datafusion_datasource::{ file_scan_config::FileScanConfigBuilder, file_stream::FileOpenFuture, file_stream::FileOpener, schema_adapter::DefaultSchemaAdapterFactory, schema_adapter::SchemaAdapterFactory, source::DataSourceExec, PartitionedFile, + TableSchema, }; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_optimizer::PhysicalOptimizerRule; @@ -156,16 +157,20 @@ impl FileSource for TestSource { }) } - fn with_schema(&self, schema: SchemaRef) -> Arc { + fn with_schema(&self, schema: TableSchema) -> Arc { + assert!( + schema.table_partition_cols().is_empty(), + "TestSource does not support partition columns" + ); Arc::new(TestSource { - schema: Some(schema), + schema: Some(schema.file_schema().clone()), ..self.clone() }) } fn with_projection(&self, config: &FileScanConfig) -> Arc { Arc::new(TestSource { - projection: config.projection.clone(), + projection: config.projection_exprs.as_ref().map(|p| p.column_indices()), ..self.clone() }) } diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index c51a5e02c9c3..8631613c3925 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -390,7 +390,7 @@ fn create_simple_csv_exec() -> Arc { Arc::new(CsvSource::new(false, 0, 0)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection(Some(vec![0, 1, 2, 3, 4])) + .with_projection_indices(Some(vec![0, 1, 2, 3, 4])) .build(); DataSourceExec::from_data_source(config) @@ -409,7 +409,7 @@ fn create_projecting_csv_exec() -> Arc { Arc::new(CsvSource::new(false, 0, 0)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection(Some(vec![3, 2, 1])) + .with_projection_indices(Some(vec![3, 2, 1])) .build(); DataSourceExec::from_data_source(config) @@ -1596,7 +1596,7 @@ fn partitioned_data_source() -> Arc { ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_table_partition_cols(vec![Field::new("partition_col", DataType::Utf8, true)]) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); DataSourceExec::from_data_source(config) diff --git a/datafusion/core/tests/physical_optimizer/sanity_checker.rs b/datafusion/core/tests/physical_optimizer/sanity_checker.rs index ce6eb13c86c4..9867ed173341 100644 --- a/datafusion/core/tests/physical_optimizer/sanity_checker.rs +++ b/datafusion/core/tests/physical_optimizer/sanity_checker.rs @@ -421,7 +421,7 @@ async fn test_bounded_window_agg_sort_requirement() -> Result<()> { assert_snapshot!( actual, @r#" - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] "# @@ -449,7 +449,7 @@ async fn test_bounded_window_agg_no_sort_requirement() -> Result<()> { assert_snapshot!( actual, @r#" - 
BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, partition_sizes=[0] "# ); diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 54a57ed90116..a7cc30a9484c 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -63,36 +63,59 @@ async fn explain_analyze_baseline_metrics() { "AggregateExec: mode=Partial, gby=[]", "metrics=[output_rows=3, elapsed_compute=" ); + assert_metrics!( + &formatted, + "AggregateExec: mode=Partial, gby=[]", + "output_bytes=" + ); assert_metrics!( &formatted, "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", "metrics=[output_rows=5, elapsed_compute=" ); + assert_metrics!( + &formatted, + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", + "output_bytes=" + ); assert_metrics!( &formatted, "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", "metrics=[output_rows=99, elapsed_compute=" ); + assert_metrics!( + &formatted, + "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", + "output_bytes=" + ); assert_metrics!( &formatted, "ProjectionExec: expr=[]", "metrics=[output_rows=5, elapsed_compute=" ); + assert_metrics!(&formatted, "ProjectionExec: expr=[]", "output_bytes="); assert_metrics!( &formatted, "CoalesceBatchesExec: target_batch_size=4096", "metrics=[output_rows=5, elapsed_compute" ); + assert_metrics!( + &formatted, + "CoalesceBatchesExec: target_batch_size=4096", + "output_bytes=" + ); assert_metrics!( &formatted, "UnionExec", "metrics=[output_rows=3, elapsed_compute=" ); + assert_metrics!(&formatted, "UnionExec", "output_bytes="); assert_metrics!( &formatted, "WindowAggExec", "metrics=[output_rows=1, elapsed_compute=" ); + assert_metrics!(&formatted, "WindowAggExec", "output_bytes="); fn expected_to_have_metrics(plan: &dyn ExecutionPlan) -> bool { use datafusion::physical_plan; @@ -161,22 +184,35 @@ fn nanos_from_timestamp(ts: &Timestamp) -> i64 { } // Test different detail level for config `datafusion.explain.analyze_level` + +async fn collect_plan_with_context( + sql_str: &str, + ctx: &SessionContext, + level: ExplainAnalyzeLevel, +) -> String { + { + let state = ctx.state_ref(); + let mut state = state.write(); + state.config_mut().options_mut().explain.analyze_level = level; + } + let dataframe = ctx.sql(sql_str).await.unwrap(); + let batches = dataframe.collect().await.unwrap(); + arrow::util::pretty::pretty_format_batches(&batches) + .unwrap() + .to_string() +} + +async fn collect_plan(sql_str: &str, level: ExplainAnalyzeLevel) -> String { + let ctx = SessionContext::new(); + collect_plan_with_context(sql_str, &ctx, level).await +} + #[tokio::test] async fn explain_analyze_level() { - async fn collect_plan(level: ExplainAnalyzeLevel) -> String { - let mut config = SessionConfig::new(); - config.options_mut().explain.analyze_level = level; - let ctx = SessionContext::new_with_config(config); - let sql = "EXPLAIN ANALYZE \ + let sql = "EXPLAIN ANALYZE \ SELECT * \ FROM generate_series(10) as t1(v1) \ ORDER BY v1 DESC"; - let dataframe = ctx.sql(sql).await.unwrap(); - let batches = dataframe.collect().await.unwrap(); - arrow::util::pretty::pretty_format_batches(&batches) - .unwrap() - .to_string() - } 
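The `output_bytes=` assertions added above boil down to a per-line substring check: the plan line naming an operator must also contain the metric text, which is what the `assert_metrics!` macro described later in `sql/mod.rs` verifies. A minimal standalone sketch of that check; the helper name `plan_line_has_metric` is hypothetical and not part of DataFusion:

```rust
/// Return true when some line of a formatted EXPLAIN ANALYZE plan mentions
/// both the operator name and the metric substring (e.g. "output_bytes=").
fn plan_line_has_metric(formatted_plan: &str, operator: &str, metric: &str) -> bool {
    formatted_plan
        .lines()
        .any(|line| line.contains(operator) && line.contains(metric))
}

fn main() {
    let formatted = "AggregateExec: mode=Partial, gby=[], metrics=[output_rows=3, output_bytes=128]\n\
                     DataSourceExec: partitions=1, metrics=[output_rows=100]";
    assert!(plan_line_has_metric(formatted, "AggregateExec", "output_bytes="));
    assert!(!plan_line_has_metric(formatted, "DataSourceExec", "output_bytes="));
}
```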
for (level, needle, should_contain) in [ (ExplainAnalyzeLevel::Summary, "spill_count", false), @@ -184,7 +220,7 @@ async fn explain_analyze_level() { (ExplainAnalyzeLevel::Dev, "spill_count", true), (ExplainAnalyzeLevel::Dev, "output_rows", true), ] { - let plan = collect_plan(level).await; + let plan = collect_plan(sql, level).await; assert_eq!( plan.contains(needle), should_contain, @@ -193,6 +229,65 @@ async fn explain_analyze_level() { } } +#[tokio::test] +async fn explain_analyze_level_datasource_parquet() { + let table_name = "tpch_lineitem_small"; + let parquet_path = "tests/data/tpch_lineitem_small.parquet"; + let sql = format!("EXPLAIN ANALYZE SELECT * FROM {table_name}"); + + // Register test parquet file into context + let ctx = SessionContext::new(); + ctx.register_parquet(table_name, parquet_path, ParquetReadOptions::default()) + .await + .expect("register parquet table for explain analyze test"); + + for (level, needle, should_contain) in [ + (ExplainAnalyzeLevel::Summary, "metadata_load_time", true), + (ExplainAnalyzeLevel::Summary, "page_index_eval_time", false), + (ExplainAnalyzeLevel::Dev, "metadata_load_time", true), + (ExplainAnalyzeLevel::Dev, "page_index_eval_time", true), + ] { + let plan = collect_plan_with_context(&sql, &ctx, level).await; + + assert_eq!( + plan.contains(needle), + should_contain, + "plan for level {level:?} unexpected content: {plan}" + ); + } +} + +#[tokio::test] +async fn explain_analyze_parquet_pruning_metrics() { + let table_name = "tpch_lineitem_small"; + let parquet_path = "tests/data/tpch_lineitem_small.parquet"; + let ctx = SessionContext::new(); + ctx.register_parquet(table_name, parquet_path, ParquetReadOptions::default()) + .await + .expect("register parquet table for explain analyze test"); + + // Test scenario: + // This table's l_orderkey has range [1, 7] + // So the following query can't prune the file: + // select * from tpch_lineitem_small where l_orderkey = 5; + // If change filter to `l_orderkey=10`, the whole file can be pruned using stat. + for (l_orderkey, expected_pruning_metrics) in + [(5, "1 total → 1 matched"), (10, "1 total → 0 matched")] + { + let sql = format!( + "explain analyze select * from {table_name} where l_orderkey = {l_orderkey};" + ); + + let plan = + collect_plan_with_context(&sql, &ctx, ExplainAnalyzeLevel::Summary).await; + + let expected_metrics = + format!("files_ranges_pruned_statistics={expected_pruning_metrics}"); + + assert_metrics!(&plan, "DataSourceExec", &expected_metrics); + } +} + #[tokio::test] async fn csv_explain_plans() { // This test verify the look of each plan in its full cycle plan creation diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index e212ee269b15..743c8750b521 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -43,7 +43,6 @@ use tempfile::TempDir; /// A macro to assert that some particular line contains two substrings /// /// Usage: `assert_metrics!(actual, operator_name, metrics)` -/// macro_rules! assert_metrics { ($ACTUAL: expr, $OPERATOR_NAME: expr, $METRICS: expr) => { let found = $ACTUAL diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 98c3e3ccee8a..8a0f62062738 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -15,8 +15,11 @@ // specific language governing permissions and limitations // under the License. 
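The `files_ranges_pruned_statistics=1 total → 0 matched` strings expected above come from a pruned/matched counter pair (the `PruningMetrics` wired up further down in `metrics.rs` and `opener.rs`). A simplified standalone model of that bookkeeping, assuming total = pruned + matched; `PruningCounts` is illustrative only, not DataFusion's implementation:

```rust
use std::fmt;

/// Simplified model of a pruning counter pair: how many candidate file
/// ranges were considered in total and how many survived ("matched").
#[derive(Default)]
struct PruningCounts {
    pruned: usize,
    matched: usize,
}

impl PruningCounts {
    fn add_pruned(&mut self, n: usize) {
        self.pruned += n;
    }
    fn add_matched(&mut self, n: usize) {
        self.matched += n;
    }
    /// A range first counted as matched can later be pruned, e.g. when a
    /// dynamic filter tightens mid-scan, so the matched count is reduced.
    fn subtract_matched(&mut self, n: usize) {
        self.matched -= n;
    }
}

impl fmt::Display for PruningCounts {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let total = self.pruned + self.matched;
        write!(f, "{total} total → {} matched", self.matched)
    }
}

fn main() {
    // One file range that the statistics could not prune (l_orderkey = 5).
    let mut kept = PruningCounts::default();
    kept.add_matched(1);
    assert_eq!(kept.to_string(), "1 total → 1 matched");

    // One file range pruned entirely by file-level statistics (l_orderkey = 10).
    let mut pruned = PruningCounts::default();
    pruned.add_pruned(1);
    assert_eq!(pruned.to_string(), "1 total → 0 matched");

    // A range kept at open time but pruned later by a dynamic filter.
    let mut late = PruningCounts::default();
    late.add_matched(1);
    late.add_pruned(1);
    late.subtract_matched(1);
    assert_eq!(late.to_string(), "1 total → 0 matched");
}
```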
+use std::collections::HashMap; + use super::*; -use datafusion_common::ScalarValue; +use datafusion::assert_batches_eq; +use datafusion_common::{metadata::ScalarAndMetadata, ParamValues, ScalarValue}; use insta::assert_snapshot; #[tokio::test] @@ -219,11 +222,11 @@ async fn test_parameter_invalid_types() -> Result<()> { .collect() .await; assert_snapshot!(results.unwrap_err().strip_backtrace(), - @r#" - type_coercion - caused by - Error during planning: Cannot infer common argument type for comparison operation List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) = Int32 - "#); + @r" + type_coercion + caused by + Error during planning: Cannot infer common argument type for comparison operation List(nullable Int32) = Int32 + "); Ok(()) } @@ -317,6 +320,53 @@ async fn test_named_parameter_not_bound() -> Result<()> { Ok(()) } +#[tokio::test] +async fn test_query_parameters_with_metadata() -> Result<()> { + let ctx = SessionContext::new(); + + let df = ctx.sql("SELECT $1, $2").await.unwrap(); + + let metadata1 = HashMap::from([("some_key".to_string(), "some_value".to_string())]); + let metadata2 = + HashMap::from([("some_other_key".to_string(), "some_other_value".to_string())]); + + let df_with_params_replaced = df + .with_param_values(ParamValues::List(vec![ + ScalarAndMetadata::new( + ScalarValue::UInt32(Some(1)), + Some(metadata1.clone().into()), + ), + ScalarAndMetadata::new( + ScalarValue::Utf8(Some("two".to_string())), + Some(metadata2.clone().into()), + ), + ])) + .unwrap(); + + // df_with_params_replaced.schema() is not correct here + // https://github.com/apache/datafusion/issues/18102 + let batches = df_with_params_replaced.clone().collect().await.unwrap(); + let schema = batches[0].schema(); + + assert_eq!(schema.field(0).data_type(), &DataType::UInt32); + assert_eq!(schema.field(0).metadata(), &metadata1); + assert_eq!(schema.field(1).data_type(), &DataType::Utf8); + assert_eq!(schema.field(1).metadata(), &metadata2); + + assert_batches_eq!( + [ + "+----+-----+", + "| $1 | $2 |", + "+----+-----+", + "| 1 | two |", + "+----+-----+", + ], + &batches + ); + + Ok(()) +} + #[tokio::test] async fn test_version_function() { let expected_version = format!( diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index f0bf15d3483b..ffe0ba021edb 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -56,7 +56,6 @@ //! //! The same answer can be produced by simply keeping track of the top //! N elements, reducing the total amount of required buffer memory. -//! 
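The `test_query_parameters_with_metadata` test above ultimately checks that metadata attached to a bound parameter surfaces as Arrow field metadata on the result schema. A minimal sketch of that field-metadata round trip using the arrow crate directly, assuming `arrow` is available as a dependency; the `$1` field name mirrors the test and this is not the DataFusion parameter-binding code path itself:

```rust
use std::collections::HashMap;

use arrow::datatypes::{DataType, Field};

fn main() {
    // Field metadata is an arbitrary string-to-string map attached to a field.
    let metadata = HashMap::from([("some_key".to_string(), "some_value".to_string())]);

    // Attach the metadata to a field, as the bound parameter $1 would carry it.
    let field = Field::new("$1", DataType::UInt32, true).with_metadata(metadata.clone());

    // The metadata round-trips through the field, which is what the test
    // checks on the schema of the collected batches.
    assert_eq!(field.data_type(), &DataType::UInt32);
    assert_eq!(field.metadata(), &metadata);
}
```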
use std::fmt::Debug; use std::hash::Hash; diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index f1af66de9b59..fb1371da6ceb 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -34,13 +34,13 @@ use datafusion::execution::context::{FunctionFactory, RegisterFunction, SessionS use datafusion::prelude::*; use datafusion::{execution::registry::FunctionRegistry, test_util}; use datafusion_common::cast::{as_float64_array, as_int32_array}; +use datafusion_common::metadata::FieldMetadata; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::utils::take_function_args; use datafusion_common::{ assert_batches_eq, assert_batches_sorted_eq, assert_contains, exec_datafusion_err, exec_err, not_impl_err, plan_err, DFSchema, DataFusionError, Result, ScalarValue, }; -use datafusion_expr::expr::FieldMetadata; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ lit_with_metadata, Accumulator, ColumnarValue, CreateFunction, CreateFunctionBody, diff --git a/datafusion/datasource-arrow/src/source.rs b/datafusion/datasource-arrow/src/source.rs index f43f11880182..f254b7e3ff30 100644 --- a/datafusion/datasource-arrow/src/source.rs +++ b/datafusion/datasource-arrow/src/source.rs @@ -20,9 +20,9 @@ use std::sync::Arc; use datafusion_datasource::as_file_source; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_datasource::TableSchema; use arrow::buffer::Buffer; -use arrow::datatypes::SchemaRef; use arrow_ipc::reader::FileDecoder; use datafusion_common::error::Result; use datafusion_common::{exec_datafusion_err, Statistics}; @@ -73,7 +73,7 @@ impl FileSource for ArrowSource { Arc::new(Self { ..self.clone() }) } - fn with_schema(&self, _schema: SchemaRef) -> Arc { + fn with_schema(&self, _schema: TableSchema) -> Arc { Arc::new(Self { ..self.clone() }) } fn with_statistics(&self, statistics: Statistics) -> Arc { diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index e013e8a3d093..6bab899e7f97 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -41,7 +41,7 @@ datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } futures = { workspace = true } -num-traits = { version = "0.2" } +num-traits = { workspace = true } object_store = { workspace = true } [dev-dependencies] diff --git a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs index 9a4d13fc191d..5ef35e2bee89 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs @@ -64,13 +64,9 @@ impl ReaderBuilder { /// let file = File::open("test/data/basic.avro").unwrap(); /// /// // create a builder, inferring the schema with the first 100 records - /// let builder = ReaderBuilder::new() - /// .read_schema() - /// .with_batch_size(100); + /// let builder = ReaderBuilder::new().read_schema().with_batch_size(100); /// - /// let reader = builder - /// .build::(file) - /// .unwrap(); + /// let reader = builder.build::(file).unwrap(); /// /// reader /// } diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 
0916222337b8..1ff73d2c3cc3 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -29,6 +29,7 @@ use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_datasource::TableSchema; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; @@ -84,11 +85,13 @@ impl FileSource for AvroSource { Arc::new(conf) } - fn with_schema(&self, schema: SchemaRef) -> Arc { + fn with_schema(&self, schema: TableSchema) -> Arc { let mut conf = self.clone(); - conf.schema = Some(schema); + // TableSchema may have partition columns, but AvroSource does not use partition columns or values atm + conf.schema = Some(Arc::clone(schema.file_schema())); Arc::new(conf) } + fn with_statistics(&self, statistics: Statistics) -> Arc { let mut conf = self.clone(); conf.projected_statistics = Some(statistics); diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index 0445329d0653..0b18571e58bd 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -29,7 +29,7 @@ use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; use datafusion_datasource::{ as_file_source, calculate_range, FileRange, ListingTableUrl, PartitionedFile, - RangeCalculation, + RangeCalculation, TableSchema, }; use arrow::csv; @@ -258,9 +258,9 @@ impl FileSource for CsvSource { Arc::new(conf) } - fn with_schema(&self, schema: SchemaRef) -> Arc { + fn with_schema(&self, schema: TableSchema) -> Arc { let mut conf = self.clone(); - conf.file_schema = Some(schema); + conf.file_schema = Some(Arc::clone(schema.file_schema())); Arc::new(conf) } diff --git a/datafusion/datasource-json/src/source.rs b/datafusion/datasource-json/src/source.rs index 0b1eee1dac58..52ed0def03f1 100644 --- a/datafusion/datasource-json/src/source.rs +++ b/datafusion/datasource-json/src/source.rs @@ -32,6 +32,7 @@ use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use datafusion_datasource::{ as_file_source, calculate_range, ListingTableUrl, PartitionedFile, RangeCalculation, + TableSchema, }; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; @@ -122,7 +123,7 @@ impl FileSource for JsonSource { Arc::new(conf) } - fn with_schema(&self, _schema: SchemaRef) -> Arc { + fn with_schema(&self, _schema: TableSchema) -> Arc { Arc::new(Self { ..self.clone() }) } fn with_statistics(&self, statistics: Statistics) -> Arc { diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 963c1d77950c..f27bda387fda 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -38,8 +38,6 @@ use datafusion_datasource::write::demux::DemuxedStreamReceiver; use arrow::datatypes::{DataType, Field, FieldRef}; use datafusion_common::config::{ConfigField, ConfigFileType, TableParquetOptions}; -#[cfg(feature = "parquet_encryption")] -use datafusion_common::encryption::map_config_decryption_to_decryption; use datafusion_common::encryption::FileDecryptionProperties; use datafusion_common::parsers::CompressionTypeVariant; use 
datafusion_common::{ @@ -59,11 +57,13 @@ use datafusion_physical_expr_common::sort_expr::LexRequirement; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use datafusion_session::Session; +use crate::metadata::DFParquetMetadata; use crate::reader::CachedParquetFileReaderFactory; use crate::source::{parse_coerce_int96_string, ParquetSource}; use async_trait::async_trait; use bytes::Bytes; use datafusion_datasource::source::DataSourceExec; +use datafusion_execution::cache::cache_manager::FileMetadataCache; use datafusion_execution::runtime_env::RuntimeEnv; use futures::future::BoxFuture; use futures::{FutureExt, StreamExt, TryStreamExt}; @@ -77,14 +77,12 @@ use parquet::arrow::arrow_writer::{ use parquet::arrow::async_reader::MetadataFetch; use parquet::arrow::{ArrowWriter, AsyncArrowWriter}; use parquet::basic::Type; - -use crate::metadata::DFParquetMetadata; -use datafusion_execution::cache::cache_manager::FileMetadataCache; +#[cfg(feature = "parquet_encryption")] +use parquet::encryption::encrypt::FileEncryptionProperties; use parquet::errors::ParquetError; use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder}; use parquet::file::writer::SerializedFileWriter; -use parquet::format::FileMetaData; use parquet::schema::types::SchemaDescriptor; use tokio::io::{AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc::{self, Receiver, Sender}; @@ -306,25 +304,23 @@ async fn get_file_decryption_properties( state: &dyn Session, options: &TableParquetOptions, file_path: &Path, -) -> Result> { - let file_decryption_properties: Option = - match &options.crypto.file_decryption { - Some(cfd) => Some(map_config_decryption_to_decryption(cfd)), - None => match &options.crypto.factory_id { - Some(factory_id) => { - let factory = - state.runtime_env().parquet_encryption_factory(factory_id)?; - factory - .get_file_decryption_properties( - &options.crypto.factory_options, - file_path, - ) - .await? - } - None => None, - }, - }; - Ok(file_decryption_properties) +) -> Result>> { + Ok(match &options.crypto.file_decryption { + Some(cfd) => Some(Arc::new(FileDecryptionProperties::from(cfd.clone()))), + None => match &options.crypto.factory_id { + Some(factory_id) => { + let factory = + state.runtime_env().parquet_encryption_factory(factory_id)?; + factory + .get_file_decryption_properties( + &options.crypto.factory_options, + file_path, + ) + .await? 
+ } + None => None, + }, + }) } #[cfg(not(feature = "parquet_encryption"))] @@ -332,7 +328,7 @@ async fn get_file_decryption_properties( _state: &dyn Session, _options: &TableParquetOptions, _file_path: &Path, -) -> Result> { +) -> Result>> { Ok(None) } @@ -385,7 +381,7 @@ impl FileFormat for ParquetFormat { .await?; let result = DFParquetMetadata::new(store.as_ref(), object) .with_metadata_size_hint(self.metadata_size_hint()) - .with_decryption_properties(file_decryption_properties.as_ref()) + .with_decryption_properties(file_decryption_properties) .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache))) .with_coerce_int96(coerce_int96) .fetch_schema_with_location() @@ -446,7 +442,7 @@ impl FileFormat for ParquetFormat { state.runtime_env().cache_manager.get_file_metadata_cache(); DFParquetMetadata::new(store, object) .with_metadata_size_hint(self.metadata_size_hint()) - .with_decryption_properties(file_decryption_properties.as_ref()) + .with_decryption_properties(file_decryption_properties) .with_file_metadata_cache(Some(file_metadata_cache)) .fetch_statistics(&table_schema) .await @@ -1027,9 +1023,10 @@ pub async fn fetch_parquet_metadata( store: &dyn ObjectStore, object_meta: &ObjectMeta, size_hint: Option, - #[allow(unused)] decryption_properties: Option<&FileDecryptionProperties>, + decryption_properties: Option<&FileDecryptionProperties>, file_metadata_cache: Option>, ) -> Result> { + let decryption_properties = decryption_properties.cloned().map(Arc::new); DFParquetMetadata::new(store, object_meta) .with_metadata_size_hint(size_hint) .with_decryption_properties(decryption_properties) @@ -1053,6 +1050,7 @@ pub async fn fetch_statistics( decryption_properties: Option<&FileDecryptionProperties>, file_metadata_cache: Option>, ) -> Result { + let decryption_properties = decryption_properties.cloned().map(Arc::new); DFParquetMetadata::new(store, file) .with_metadata_size_hint(metadata_size_hint) .with_decryption_properties(decryption_properties) @@ -1080,7 +1078,7 @@ pub struct ParquetSink { parquet_options: TableParquetOptions, /// File metadata from successfully produced parquet files. The Mutex is only used /// to allow inserting to HashMap from behind borrowed reference in DataSink::write_all. - written: Arc>>, + written: Arc>>, } impl Debug for ParquetSink { @@ -1117,7 +1115,7 @@ impl ParquetSink { /// Retrieve the file metadata for the written files, keyed to the path /// which may be partitioned (in the case of hive style partitioning). 
- pub fn written(&self) -> HashMap { + pub fn written(&self) -> HashMap { self.written.lock().clone() } @@ -1141,7 +1139,7 @@ impl ParquetSink { builder = set_writer_encryption_properties( builder, runtime, - &parquet_opts, + parquet_opts, schema, path, ) @@ -1189,14 +1187,15 @@ impl ParquetSink { async fn set_writer_encryption_properties( builder: WriterPropertiesBuilder, runtime: &Arc, - parquet_opts: &TableParquetOptions, + parquet_opts: TableParquetOptions, schema: &Arc, path: &Path, ) -> Result { - if let Some(file_encryption_properties) = &parquet_opts.crypto.file_encryption { + if let Some(file_encryption_properties) = parquet_opts.crypto.file_encryption { // Encryption properties have been specified directly - return Ok(builder - .with_file_encryption_properties(file_encryption_properties.clone().into())); + return Ok(builder.with_file_encryption_properties(Arc::new( + FileEncryptionProperties::from(file_encryption_properties), + ))); } else if let Some(encryption_factory_id) = &parquet_opts.crypto.factory_id.as_ref() { // Encryption properties will be generated by an encryption factory let encryption_factory = @@ -1221,7 +1220,7 @@ async fn set_writer_encryption_properties( async fn set_writer_encryption_properties( builder: WriterPropertiesBuilder, _runtime: &Arc, - _parquet_opts: &TableParquetOptions, + _parquet_opts: TableParquetOptions, _schema: &Arc, _path: &Path, ) -> Result { @@ -1244,7 +1243,7 @@ impl FileSink for ParquetSink { let parquet_opts = &self.parquet_options; let mut file_write_tasks: JoinSet< - std::result::Result<(Path, FileMetaData), DataFusionError>, + std::result::Result<(Path, ParquetMetaData), DataFusionError>, > = JoinSet::new(); let runtime = context.runtime_env(); @@ -1275,11 +1274,11 @@ impl FileSink for ParquetSink { writer.write(&batch).await?; reservation.try_resize(writer.memory_size())?; } - let file_metadata = writer + let parquet_meta_data = writer .close() .await .map_err(|e| DataFusionError::ParquetError(Box::new(e)))?; - Ok((path, file_metadata)) + Ok((path, parquet_meta_data)) }); } else { let writer = ObjectWriterBuilder::new( @@ -1303,7 +1302,7 @@ impl FileSink for ParquetSink { let parallel_options_clone = parallel_options.clone(); let pool = Arc::clone(context.memory_pool()); file_write_tasks.spawn(async move { - let file_metadata = output_single_parquet_file_parallelized( + let parquet_meta_data = output_single_parquet_file_parallelized( writer, rx, schema, @@ -1313,7 +1312,7 @@ impl FileSink for ParquetSink { pool, ) .await?; - Ok((path, file_metadata)) + Ok((path, parquet_meta_data)) }); } } @@ -1322,11 +1321,11 @@ impl FileSink for ParquetSink { while let Some(result) = file_write_tasks.join_next().await { match result { Ok(r) => { - let (path, file_metadata) = r?; - row_count += file_metadata.num_rows; + let (path, parquet_meta_data) = r?; + row_count += parquet_meta_data.file_metadata().num_rows(); let mut written_files = self.written.lock(); written_files - .try_insert(path.clone(), file_metadata) + .try_insert(path.clone(), parquet_meta_data) .map_err(|e| internal_datafusion_err!("duplicate entry detected for partitioned file {path}: {e}"))?; drop(written_files); } @@ -1589,7 +1588,7 @@ async fn concatenate_parallel_row_groups( mut serialize_rx: Receiver>, mut object_store_writer: Box, pool: Arc, -) -> Result { +) -> Result { let mut file_reservation = MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool); @@ -1617,14 +1616,14 @@ async fn concatenate_parallel_row_groups( rg_out.close()?; } - let file_metadata = 
parquet_writer.close()?; + let parquet_meta_data = parquet_writer.close()?; let final_buff = merged_buff.buffer.try_lock().unwrap(); object_store_writer.write_all(final_buff.as_slice()).await?; object_store_writer.shutdown().await?; file_reservation.free(); - Ok(file_metadata) + Ok(parquet_meta_data) } /// Parallelizes the serialization of a single parquet file, by first serializing N @@ -1639,7 +1638,7 @@ async fn output_single_parquet_file_parallelized( skip_arrow_metadata: bool, parallel_options: ParallelParquetWriterOptions, pool: Arc, -) -> Result { +) -> Result { let max_rowgroups = parallel_options.max_parallel_row_groups; // Buffer size of this channel limits maximum number of RowGroups being worked on in parallel let (serialize_tx, serialize_rx) = @@ -1666,7 +1665,7 @@ async fn output_single_parquet_file_parallelized( parallel_options, Arc::clone(&pool), ); - let file_metadata = concatenate_parallel_row_groups( + let parquet_meta_data = concatenate_parallel_row_groups( writer, merged_buff, serialize_rx, @@ -1679,7 +1678,7 @@ async fn output_single_parquet_file_parallelized( .join_unwind() .await .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??; - Ok(file_metadata) + Ok(parquet_meta_data) } #[cfg(test)] diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs index 4de68793ce02..6505a447d7ce 100644 --- a/datafusion/datasource-parquet/src/metadata.rs +++ b/datafusion/datasource-parquet/src/metadata.rs @@ -58,7 +58,7 @@ pub struct DFParquetMetadata<'a> { store: &'a dyn ObjectStore, object_meta: &'a ObjectMeta, metadata_size_hint: Option, - decryption_properties: Option<&'a FileDecryptionProperties>, + decryption_properties: Option>, file_metadata_cache: Option>, /// timeunit to coerce INT96 timestamps to pub coerce_int96: Option, @@ -85,7 +85,7 @@ impl<'a> DFParquetMetadata<'a> { /// set decryption properties pub fn with_decryption_properties( mut self, - decryption_properties: Option<&'a FileDecryptionProperties>, + decryption_properties: Option>, ) -> Self { self.decryption_properties = decryption_properties; self @@ -145,7 +145,8 @@ impl<'a> DFParquetMetadata<'a> { #[cfg(feature = "parquet_encryption")] if let Some(decryption_properties) = decryption_properties { - reader = reader.with_decryption_properties(Some(decryption_properties)); + reader = reader + .with_decryption_properties(Some(Arc::clone(decryption_properties))); } if cache_metadata && file_metadata_cache.is_some() { @@ -299,7 +300,6 @@ impl<'a> DFParquetMetadata<'a> { summarize_min_max_null_counts( &mut accumulators, idx, - num_rows, &stats_converter, row_groups_metadata, ) @@ -417,7 +417,6 @@ struct StatisticsAccumulators<'a> { fn summarize_min_max_null_counts( accumulators: &mut StatisticsAccumulators, arrow_schema_index: usize, - num_rows: usize, stats_converter: &StatisticsConverter, row_groups_metadata: &[RowGroupMetaData], ) -> Result<()> { @@ -449,11 +448,14 @@ fn summarize_min_max_null_counts( ); } - accumulators.null_counts_array[arrow_schema_index] = - Precision::Exact(match sum(&null_counts) { - Some(null_count) => null_count as usize, - None => num_rows, - }); + accumulators.null_counts_array[arrow_schema_index] = match sum(&null_counts) { + Some(null_count) => Precision::Exact(null_count as usize), + None => match null_counts.len() { + // If sum() returned None we either have no rows or all values are null + 0 => Precision::Exact(0), + _ => Precision::Absent, + }, + }; Ok(()) } diff --git a/datafusion/datasource-parquet/src/metrics.rs 
b/datafusion/datasource-parquet/src/metrics.rs index d75a979d4cad..9d86a3ae9f2d 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -16,7 +16,7 @@ // under the License. use datafusion_physical_plan::metrics::{ - Count, ExecutionPlanMetricsSet, MetricBuilder, Time, + Count, ExecutionPlanMetricsSet, MetricBuilder, MetricType, PruningMetrics, Time, }; /// Stores metrics about the parquet execution for a particular parquet file. @@ -27,7 +27,7 @@ use datafusion_physical_plan::metrics::{ /// [`ParquetFileReaderFactory`]: super::ParquetFileReaderFactory #[derive(Debug, Clone)] pub struct ParquetFileMetrics { - /// Number of file **ranges** pruned by partition or file level statistics. + /// Number of file **ranges** pruned or matched by partition or file level statistics. /// Pruning of files often happens at planning time but may happen at execution time /// if dynamic filters (e.g. from a join) result in additional pruning. /// @@ -41,7 +41,7 @@ pub struct ParquetFileMetrics { /// pushdown optimization may fill up the TopK heap when reading the first part of a file, /// then skip the second part if file statistics indicate it cannot contain rows /// that would be in the TopK. - pub files_ranges_pruned_statistics: Count, + pub files_ranges_pruned_statistics: PruningMetrics, /// Number of times the predicate could not be evaluated pub predicate_evaluation_errors: Count, /// Number of row groups whose bloom filters were checked and matched (not pruned) @@ -88,30 +88,59 @@ impl ParquetFileMetrics { filename: &str, metrics: &ExecutionPlanMetricsSet, ) -> Self { - let predicate_evaluation_errors = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .counter("predicate_evaluation_errors", partition); - + // ----------------------- + // 'summary' level metrics + // ----------------------- let row_groups_matched_bloom_filter = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) + .with_type(MetricType::SUMMARY) .counter("row_groups_matched_bloom_filter", partition); let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) + .with_type(MetricType::SUMMARY) .counter("row_groups_pruned_bloom_filter", partition); let row_groups_matched_statistics = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) + .with_type(MetricType::SUMMARY) .counter("row_groups_matched_statistics", partition); let row_groups_pruned_statistics = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) + .with_type(MetricType::SUMMARY) .counter("row_groups_pruned_statistics", partition); + let page_index_rows_pruned = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .with_type(MetricType::SUMMARY) + .counter("page_index_rows_pruned", partition); + let page_index_rows_matched = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .with_type(MetricType::SUMMARY) + .counter("page_index_rows_matched", partition); + let bytes_scanned = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) + .with_type(MetricType::SUMMARY) .counter("bytes_scanned", partition); + let metadata_load_time = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .with_type(MetricType::SUMMARY) + .subset_time("metadata_load_time", partition); + + let files_ranges_pruned_statistics = MetricBuilder::new(metrics) + 
.with_type(MetricType::SUMMARY) + .pruning_metrics("files_ranges_pruned_statistics", partition); + + // ----------------------- + // 'dev' level metrics + // ----------------------- + let predicate_evaluation_errors = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("predicate_evaluation_errors", partition); + let pushdown_rows_pruned = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) .counter("pushdown_rows_pruned", partition); @@ -129,24 +158,10 @@ impl ParquetFileMetrics { .with_new_label("filename", filename.to_string()) .subset_time("bloom_filter_eval_time", partition); - let page_index_rows_pruned = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .counter("page_index_rows_pruned", partition); - let page_index_rows_matched = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .counter("page_index_rows_matched", partition); - let page_index_eval_time = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) .subset_time("page_index_eval_time", partition); - let metadata_load_time = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .subset_time("metadata_load_time", partition); - - let files_ranges_pruned_statistics = MetricBuilder::new(metrics) - .counter("files_ranges_pruned_statistics", partition); - let predicate_cache_inner_records = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) .counter("predicate_cache_inner_records", partition); diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 167fc3c5147e..1c9b9feb9f50 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -40,7 +40,9 @@ use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::{ is_dynamic_physical_expr, PhysicalExpr, }; -use datafusion_physical_plan::metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder}; +use datafusion_physical_plan::metrics::{ + Count, ExecutionPlanMetricsSet, MetricBuilder, PruningMetrics, +}; use datafusion_pruning::{build_pruning_predicate, FilePruner, PruningPredicate}; #[cfg(feature = "parquet_encryption")] @@ -195,11 +197,13 @@ impl FileOpener for ParquetOpener { if let Some(file_pruner) = &mut file_pruner { if file_pruner.should_prune()? { // Return an empty stream immediately to skip the work of setting up the actual stream - file_metrics.files_ranges_pruned_statistics.add(1); + file_metrics.files_ranges_pruned_statistics.add_pruned(1); return Ok(futures::stream::empty().boxed()); } } + file_metrics.files_ranges_pruned_statistics.add_matched(1); + // Don't load the page index yet. Since it is not stored inline in // the footer, loading the page index if it is not needed will do // unnecessary I/O. 
We decide later if it is needed to evaluate the @@ -208,7 +212,7 @@ impl FileOpener for ParquetOpener { let mut options = ArrowReaderOptions::new().with_page_index(false); #[cfg(feature = "parquet_encryption")] if let Some(fd_val) = file_decryption_properties { - options = options.with_file_decryption_properties((*fd_val).clone()); + options = options.with_file_decryption_properties(Arc::clone(&fd_val)); } let mut metadata_timer = file_metrics.metadata_load_time.timer(); @@ -480,7 +484,7 @@ struct EarlyStoppingStream { /// None done: bool, file_pruner: FilePruner, - files_ranges_pruned_statistics: Count, + files_ranges_pruned_statistics: PruningMetrics, /// The inner stream inner: S, } @@ -489,7 +493,7 @@ impl EarlyStoppingStream { pub fn new( stream: S, file_pruner: FilePruner, - files_ranges_pruned_statistics: Count, + files_ranges_pruned_statistics: PruningMetrics, ) -> Self { Self { done: false, @@ -509,7 +513,9 @@ where // Since dynamic filters may have been updated, see if we can stop // reading this stream entirely. if self.file_pruner.should_prune()? { - self.files_ranges_pruned_statistics.add(1); + self.files_ranges_pruned_statistics.add_pruned(1); + // Previously this file range has been counted as matched + self.files_ranges_pruned_statistics.subtract_matched(1); self.done = true; Ok(None) } else { @@ -581,8 +587,7 @@ impl EncryptionContext { None => match &self.encryption_factory { Some((encryption_factory, encryption_config)) => Ok(encryption_factory .get_file_decryption_properties(encryption_config, file_location) - .await? - .map(Arc::new)), + .await?), None => Ok(None), }, } diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs index 5f3e05747d40..82deedd406ce 100644 --- a/datafusion/datasource-parquet/src/page_filter.rs +++ b/datafusion/datasource-parquet/src/page_filter.rs @@ -36,7 +36,7 @@ use datafusion_pruning::PruningPredicate; use log::{debug, trace}; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; use parquet::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex}; -use parquet::format::PageLocation; +use parquet::file::page_index::offset_index::PageLocation; use parquet::schema::types::SchemaDescriptor; use parquet::{ arrow::arrow_reader::{RowSelection, RowSelector}, @@ -90,7 +90,6 @@ use parquet::{ /// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛ /// /// Total rows: 300 -/// /// ``` /// /// Given the predicate `A > 35 AND B = 'F'`: diff --git a/datafusion/datasource-parquet/src/reader.rs b/datafusion/datasource-parquet/src/reader.rs index 687a7f15fccc..88a3cea5623b 100644 --- a/datafusion/datasource-parquet/src/reader.rs +++ b/datafusion/datasource-parquet/src/reader.rs @@ -262,8 +262,9 @@ impl AsyncFileReader for CachedParquetFileReader { async move { #[cfg(feature = "parquet_encryption")] - let file_decryption_properties = - options.and_then(|o| o.file_decryption_properties()); + let file_decryption_properties = options + .and_then(|o| o.file_decryption_properties()) + .map(Arc::clone); #[cfg(not(feature = "parquet_encryption"))] let file_decryption_properties = None; diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 20d71692926f..edc9c65450ec 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -35,11 +35,12 @@ use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapterFactory, }; -use arrow::datatypes::{SchemaRef, TimeUnit}; +use 
arrow::datatypes::TimeUnit; use datafusion_common::config::TableParquetOptions; use datafusion_common::{DataFusionError, Statistics}; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::TableSchema; use datafusion_physical_expr::conjunction; use datafusion_physical_expr_adapter::DefaultPhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::fmt_sql; @@ -52,12 +53,12 @@ use datafusion_physical_plan::metrics::Count; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::DisplayFormatType; -#[cfg(feature = "parquet_encryption")] -use datafusion_common::encryption::map_config_decryption_to_decryption; #[cfg(feature = "parquet_encryption")] use datafusion_execution::parquet_encryption::EncryptionFactory; use itertools::Itertools; use object_store::ObjectStore; +#[cfg(feature = "parquet_encryption")] +use parquet::encryption::decrypt::FileDecryptionProperties; /// Execution plan for reading one or more Parquet files. /// @@ -85,7 +86,6 @@ use object_store::ObjectStore; /// │.───────────────────.│ /// │ ) /// `───────────────────' -/// /// ``` /// /// # Example: Create a `DataSourceExec` @@ -275,7 +275,7 @@ pub struct ParquetSource { /// The schema of the file. /// In particular, this is the schema of the table without partition columns, /// *not* the physical schema of the file. - pub(crate) file_schema: Option, + pub(crate) table_schema: Option, /// Optional predicate for row filtering during parquet scan pub(crate) predicate: Option>, /// Optional user defined parquet file reader factory @@ -349,7 +349,6 @@ impl ParquetSource { } /// Optional user defined parquet file reader factory. - /// pub fn with_parquet_file_reader_factory( mut self, parquet_file_reader_factory: Arc, @@ -547,8 +546,8 @@ impl FileSource for ParquetSource { .table_parquet_options() .crypto .file_decryption - .as_ref() - .map(map_config_decryption_to_decryption) + .clone() + .map(FileDecryptionProperties::from) .map(Arc::new); let coerce_int96 = self @@ -601,9 +600,9 @@ impl FileSource for ParquetSource { Arc::new(conf) } - fn with_schema(&self, schema: SchemaRef) -> Arc { + fn with_schema(&self, schema: TableSchema) -> Arc { Arc::new(Self { - file_schema: Some(schema), + table_schema: Some(schema), ..self.clone() }) } @@ -661,9 +660,10 @@ impl FileSource for ParquetSource { // the actual predicates are built in reference to the physical schema of // each file, which we do not have at this point and hence cannot use. // Instead we use the logical schema of the file (the table schema without partition columns). 
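ParquetSource now stores a `TableSchema` instead of a bare `SchemaRef`, and the surrounding code distinguishes `file_schema()` (file columns only, used when building pruning predicates) from `table_schema()` (file columns plus partition columns, used for filter pushdown decisions). A simplified standalone model of those two views, assuming partition columns are appended after the file columns; `SimpleTableSchema` is illustrative, not DataFusion's `TableSchema`:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema, SchemaRef};

/// Simplified stand-in for a table schema: the schema of the data files
/// plus the partition columns appended at the end.
struct SimpleTableSchema {
    file_schema: SchemaRef,
    table_partition_cols: Vec<Field>,
}

impl SimpleTableSchema {
    /// Schema of the file contents only.
    fn file_schema(&self) -> &SchemaRef {
        &self.file_schema
    }

    /// Full table schema: file columns followed by partition columns.
    fn table_schema(&self) -> SchemaRef {
        let mut fields: Vec<Field> = self
            .file_schema
            .fields()
            .iter()
            .map(|f| f.as_ref().clone())
            .collect();
        fields.extend(self.table_partition_cols.iter().cloned());
        Arc::new(Schema::new(fields))
    }
}

fn main() {
    let ts = SimpleTableSchema {
        file_schema: Arc::new(Schema::new(vec![
            Field::new("nullable_col", DataType::Int32, true),
            Field::new("non_nullable_col", DataType::Int32, false),
        ])),
        table_partition_cols: vec![Field::new("date", DataType::Utf8, false)],
    };
    assert_eq!(ts.file_schema().fields().len(), 2);
    assert_eq!(ts.table_schema().fields().len(), 3);
}
```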
- if let (Some(file_schema), Some(predicate)) = - (&self.file_schema, &self.predicate) - { + if let (Some(file_schema), Some(predicate)) = ( + &self.table_schema.as_ref().map(|ts| ts.file_schema()), + &self.predicate, + ) { let predicate_creation_errors = Count::new(); if let (Some(pruning_predicate), _) = build_pruning_predicates( Some(predicate), @@ -700,7 +700,12 @@ impl FileSource for ParquetSource { filters: Vec>, config: &ConfigOptions, ) -> datafusion_common::Result>> { - let Some(file_schema) = self.file_schema.clone() else { + let Some(table_schema) = self + .table_schema + .as_ref() + .map(|ts| ts.table_schema()) + .cloned() + else { return Ok(FilterPushdownPropagation::with_parent_pushdown_result( vec![PushedDown::No; filters.len()], )); @@ -720,7 +725,7 @@ impl FileSource for ParquetSource { let filters: Vec = filters .into_iter() .map(|filter| { - if can_expr_be_pushed_down_with_schemas(&filter, &file_schema) { + if can_expr_be_pushed_down_with_schemas(&filter, &table_schema) { PushedDownPredicate::supported(filter) } else { PushedDownPredicate::unsupported(filter) diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index 7a2cf403fd8d..d6ade3b8b210 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -26,7 +26,7 @@ use crate::file_groups::FileGroupPartitioner; use crate::file_scan_config::FileScanConfig; use crate::file_stream::FileOpener; use crate::schema_adapter::SchemaAdapterFactory; -use arrow::datatypes::SchemaRef; +use crate::TableSchema; use datafusion_common::config::ConfigOptions; use datafusion_common::{not_impl_err, Result, Statistics}; use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; @@ -64,7 +64,7 @@ pub trait FileSource: Send + Sync { /// Initialize new type with batch size configuration fn with_batch_size(&self, batch_size: usize) -> Arc; /// Initialize new instance with a new schema - fn with_schema(&self, schema: SchemaRef) -> Arc; + fn with_schema(&self, schema: TableSchema) -> Arc; /// Initialize new instance with projection information fn with_projection(&self, config: &FileScanConfig) -> Arc; /// Initialize new instance with projected statistics diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 4dfb6a4ec3d3..5847a8cf5e11 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -44,18 +44,20 @@ use datafusion_execution::{ object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext, }; use datafusion_expr::Operator; -use datafusion_physical_expr::expressions::BinaryExpr; -use datafusion_physical_expr::{expressions::Column, utils::reassign_expr_columns}; +use datafusion_physical_expr::expressions::{BinaryExpr, Column}; +use datafusion_physical_expr::projection::ProjectionExprs; +use datafusion_physical_expr::utils::reassign_expr_columns; use datafusion_physical_expr::{split_conjunction, EquivalenceProperties, Partitioning}; use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::projection::ProjectionExpr; +use datafusion_physical_plan::projection::{ + all_alias_free_columns, new_projections_for_columns, ProjectionExpr, +}; use datafusion_physical_plan::{ display::{display_orderings, ProjectSchemaDisplay}, filter_pushdown::FilterPushdownPropagation, metrics::ExecutionPlanMetricsSet, - 
projection::{all_alias_free_columns, new_projections_for_columns}, DisplayAs, DisplayFormatType, }; use std::{ @@ -87,6 +89,7 @@ use log::{debug, warn}; /// # use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; /// # use datafusion_datasource::file_stream::FileOpener; /// # use datafusion_datasource::source::DataSourceExec; +/// # use datafusion_datasource::table_schema::TableSchema; /// # use datafusion_execution::object_store::ObjectStoreUrl; /// # use datafusion_physical_plan::ExecutionPlan; /// # use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; @@ -107,7 +110,7 @@ use log::{debug, warn}; /// # fn create_file_opener(&self, _: Arc, _: &FileScanConfig, _: usize) -> Arc { unimplemented!() } /// # fn as_any(&self) -> &dyn Any { self } /// # fn with_batch_size(&self, _: usize) -> Arc { unimplemented!() } -/// # fn with_schema(&self, _: SchemaRef) -> Arc { Arc::new(self.clone()) as Arc } +/// # fn with_schema(&self, _: TableSchema) -> Arc { Arc::new(self.clone()) as Arc } /// # fn with_projection(&self, _: &FileScanConfig) -> Arc { unimplemented!() } /// # fn with_statistics(&self, statistics: Statistics) -> Arc { Arc::new(Self {projected_statistics: Some(statistics), schema_adapter_factory: self.schema_adapter_factory.clone()} ) } /// # fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() } @@ -124,7 +127,7 @@ use log::{debug, warn}; /// let file_source = Arc::new(ParquetSource::new()); /// let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source) /// .with_limit(Some(1000)) // read only the first 1000 records -/// .with_projection(Some(vec![2, 3])) // project columns 2 and 3 +/// .with_projection_indices(Some(vec![2, 3])) // project columns 2 and 3 /// // Read /tmp/file1.parquet with known size of 1234 bytes in a single group /// .with_file(PartitionedFile::new("file1.parquet", 1234)) /// // Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes @@ -175,9 +178,12 @@ pub struct FileScanConfig { pub file_groups: Vec, /// Table constraints pub constraints: Constraints, - /// Columns on which to project the data. Indexes that are higher than the - /// number of columns of `file_schema` refer to `table_partition_cols`. - pub projection: Option>, + /// Physical expressions defining the projection to apply when reading data. + /// + /// Each expression in the projection can reference columns from both the file + /// schema and table partition columns. If `None`, all columns from the table + /// schema are projected. + pub projection_exprs: Option, /// The maximum number of records to read from this plan. If `None`, /// all records after filtering are returned. 
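Per the convention documented for `projection_exprs` above (and relied on by `projected_file_column_names` and `file_column_projection_indices` below), projection indices past the end of the file schema refer to table partition columns. A small illustrative helper showing that split; `split_projection` is not a DataFusion API:

```rust
/// Split projected column indices into file-column indices and
/// partition-column indices, where indices past the end of the file
/// schema refer to table partition columns.
fn split_projection(indices: &[usize], num_file_columns: usize) -> (Vec<usize>, Vec<usize>) {
    let file_cols = indices
        .iter()
        .copied()
        .filter(|&i| i < num_file_columns)
        .collect();
    let partition_cols = indices
        .iter()
        .copied()
        .filter(|&i| i >= num_file_columns)
        .map(|i| i - num_file_columns)
        .collect();
    (file_cols, partition_cols)
}

fn main() {
    // 4 file columns (indices 0..=3); index 4 refers to the first partition column.
    let (file_cols, partition_cols) = split_projection(&[0, 2, 4], 4);
    assert_eq!(file_cols, vec![0, 2]);
    assert_eq!(partition_cols, vec![0]);
}
```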
pub limit: Option, @@ -229,7 +235,7 @@ pub struct FileScanConfig { /// // Set a limit of 1000 rows /// .with_limit(Some(1000)) /// // Project only the first column -/// .with_projection(Some(vec![0])) +/// .with_projection_indices(Some(vec![0])) /// // Add partition columns /// .with_table_partition_cols(vec![ /// Field::new("date", DataType::Utf8, false), @@ -261,7 +267,7 @@ pub struct FileScanConfigBuilder { table_schema: TableSchema, file_source: Arc, limit: Option, - projection: Option>, + projection_indices: Option>, constraints: Option, file_groups: Vec, statistics: Option, @@ -294,7 +300,7 @@ impl FileScanConfigBuilder { file_compression_type: None, new_lines_in_values: None, limit: None, - projection: None, + projection_indices: None, constraints: None, batch_size: None, expr_adapter_factory: None, @@ -317,10 +323,25 @@ impl FileScanConfigBuilder { self } + pub fn table_schema(&self) -> &SchemaRef { + self.table_schema.table_schema() + } + /// Set the columns on which to project the data. Indexes that are higher than the /// number of columns of `file_schema` refer to `table_partition_cols`. - pub fn with_projection(mut self, projection: Option>) -> Self { - self.projection = projection; + /// + /// # Deprecated + /// Use [`Self::with_projection_indices`] instead. This method will be removed in a future release. + #[deprecated(since = "51.0.0", note = "Use with_projection_indices instead")] + pub fn with_projection(self, indices: Option>) -> Self { + self.with_projection_indices(indices) + } + + /// Set the columns on which to project the data using column indices. + /// + /// Indexes that are higher than the number of columns of `file_schema` refer to `table_partition_cols`. + pub fn with_projection_indices(mut self, indices: Option>) -> Self { + self.projection_indices = indices; self } @@ -433,7 +454,7 @@ impl FileScanConfigBuilder { table_schema, file_source, limit, - projection, + projection_indices, constraints, file_groups, statistics, @@ -450,17 +471,23 @@ impl FileScanConfigBuilder { let file_source = file_source .with_statistics(statistics.clone()) - .with_schema(Arc::clone(table_schema.file_schema())); + .with_schema(table_schema.clone()); let file_compression_type = file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); let new_lines_in_values = new_lines_in_values.unwrap_or(false); + // Convert projection indices to ProjectionExprs using the final table schema + // (which now includes partition columns if they were added) + let projection_exprs = projection_indices.map(|indices| { + ProjectionExprs::from_indices(&indices, table_schema.table_schema()) + }); + FileScanConfig { object_store_url, table_schema, file_source, limit, - projection, + projection_exprs, constraints, file_groups, output_ordering, @@ -484,7 +511,9 @@ impl From for FileScanConfigBuilder { file_compression_type: Some(config.file_compression_type), new_lines_in_values: Some(config.new_lines_in_values), limit: config.limit, - projection: config.projection, + projection_indices: config + .projection_exprs + .map(|p| p.ordered_column_indices()), constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, @@ -598,8 +627,39 @@ impl DataSource for FileScanConfig { SchedulingType::Cooperative } - fn statistics(&self) -> Result { - Ok(self.projected_stats()) + fn partition_statistics(&self, partition: Option) -> Result { + if let Some(partition) = partition { + // Get statistics for a specific partition + if let Some(file_group) = 
self.file_groups.get(partition) { + if let Some(stat) = file_group.file_statistics(None) { + // Project the statistics based on the projection + let table_cols_stats = self + .projection_indices() + .into_iter() + .map(|idx| { + if idx < self.file_schema().fields().len() { + stat.column_statistics[idx].clone() + } else { + // TODO provide accurate stat for partition column + // See https://github.com/apache/datafusion/issues/1186 + ColumnStatistics::new_unknown() + } + }) + .collect(); + + return Ok(Statistics { + num_rows: stat.num_rows, + total_byte_size: stat.total_byte_size, + column_statistics: table_cols_stats, + }); + } + } + // If no statistics available for this partition, return unknown + Ok(Statistics::new_unknown(&self.projected_schema())) + } else { + // Return aggregate statistics across all partitions + Ok(self.projected_stats()) + } } fn with_fetch(&self, limit: Option) -> Option> { @@ -642,15 +702,16 @@ impl DataSource for FileScanConfig { let new_projections = new_projections_for_columns( projection, &file_scan - .projection - .clone() + .projection_exprs + .as_ref() + .map(|p| p.ordered_column_indices()) .unwrap_or_else(|| (0..self.file_schema().fields().len()).collect()), ); Arc::new( FileScanConfigBuilder::from(file_scan) // Assign projected statistics to source - .with_projection(Some(new_projections)) + .with_projection_indices(Some(new_projections)) .with_source(source) .build(), ) as _ @@ -696,8 +757,8 @@ impl FileScanConfig { } fn projection_indices(&self) -> Vec { - match &self.projection { - Some(proj) => proj.clone(), + match &self.projection_exprs { + Some(proj) => proj.ordered_column_indices(), None => (0..self.file_schema().fields().len() + self.table_partition_cols().len()) .collect(), @@ -794,7 +855,7 @@ impl FileScanConfig { /// Project the schema, constraints, and the statistics on the given column indices pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec) { - if self.projection.is_none() && self.table_partition_cols().is_empty() { + if self.projection_exprs.is_none() && self.table_partition_cols().is_empty() { return ( Arc::clone(self.file_schema()), self.constraints.clone(), @@ -813,12 +874,17 @@ impl FileScanConfig { } pub fn projected_file_column_names(&self) -> Option> { - self.projection.as_ref().map(|p| { - p.iter() - .filter(|col_idx| **col_idx < self.file_schema().fields().len()) - .map(|col_idx| self.file_schema().field(*col_idx).name()) + let fields = self.file_schema().fields(); + + self.projection_exprs.as_ref().map(|p| { + let column_indices = p.ordered_column_indices(); + + column_indices + .iter() + .filter(|&&col_i| col_i < fields.len()) + .map(|&col_i| self.file_schema().field(col_i).name()) .cloned() - .collect() + .collect::>() }) } @@ -844,11 +910,11 @@ impl FileScanConfig { } pub fn file_column_projection_indices(&self) -> Option> { - self.projection.as_ref().map(|p| { - p.iter() - .filter(|col_idx| **col_idx < self.file_schema().fields().len()) - .copied() - .collect() + self.projection_exprs.as_ref().map(|p| { + p.ordered_column_indices() + .into_iter() + .filter(|&i| i < self.file_schema().fields().len()) + .collect::>() }) } @@ -1323,25 +1389,25 @@ fn create_output_array( /// correctly sorted on `(A, B, C)` /// /// ```text -///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┓ -/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ┐ -///┃ ┌───────────────┐ ┌──────────────┐ │ ┌──────────────┐ │ ┌─────────────┐ ┃ -/// │ │ 1.parquet │ │ │ │ 2.parquet 
│ │ │ 3.parquet │ │ │ 4.parquet │ │ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ │Sort: A, B, C │ │ │Sort: A, B, C│ ┃ -/// │ └───────────────┘ │ │ └──────────────┘ │ └──────────────┘ │ └─────────────┘ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ -/// DataFusion DataFusion DataFusion DataFusion -///┃ Partition 1 Partition 2 Partition 3 Partition 4 ┃ -/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ +/// ┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┓ +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ┐ +/// ┃ ┌───────────────┐ ┌──────────────┐ │ ┌──────────────┐ │ ┌─────────────┐ ┃ +/// │ │ 1.parquet │ │ │ │ 2.parquet │ │ │ 3.parquet │ │ │ 4.parquet │ │ +/// ┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ │Sort: A, B, C │ │ │Sort: A, B, C│ ┃ +/// │ └───────────────┘ │ │ └──────────────┘ │ └──────────────┘ │ └─────────────┘ │ +/// ┃ │ │ ┃ +/// │ │ │ │ │ │ +/// ┃ │ │ ┃ +/// │ │ │ │ │ │ +/// ┃ │ │ ┃ +/// │ │ │ │ │ │ +/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ +/// DataFusion DataFusion DataFusion DataFusion +/// ┃ Partition 1 Partition 2 Partition 3 Partition 4 ┃ +/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ /// /// DataSourceExec -///``` +/// ``` /// /// However, when more than 1 file is assigned to each partition, each /// partition is NOT correctly sorted on `(A, B, C)`. Once the second @@ -1349,25 +1415,25 @@ fn create_output_array( /// the same sorted stream /// ///```text -///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ -/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ -///┃ ┌───────────────┐ ┌──────────────┐ │ -/// │ │ 1.parquet │ │ │ │ 2.parquet │ ┃ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ -/// │ └───────────────┘ │ │ └──────────────┘ ┃ -///┃ ┌───────────────┐ ┌──────────────┐ │ -/// │ │ 3.parquet │ │ │ │ 4.parquet │ ┃ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ -/// │ └───────────────┘ │ │ └──────────────┘ ┃ -///┃ │ -/// │ │ │ ┃ -///┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ -/// DataFusion DataFusion ┃ -///┃ Partition 1 Partition 2 -/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┛ +/// ┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ +/// ┃ ┌───────────────┐ ┌──────────────┐ │ +/// │ │ 1.parquet │ │ │ │ 2.parquet │ ┃ +/// ┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ +/// │ └───────────────┘ │ │ └──────────────┘ ┃ +/// ┃ ┌───────────────┐ ┌──────────────┐ │ +/// │ │ 3.parquet │ │ │ │ 4.parquet │ ┃ +/// ┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ +/// │ └───────────────┘ │ │ └──────────────┘ ┃ +/// ┃ │ +/// │ │ │ ┃ +/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ +/// DataFusion DataFusion ┃ +/// ┃ Partition 1 Partition 2 +/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┛ /// /// DataSourceExec -///``` +/// ``` fn get_projected_output_ordering( base_config: &FileScanConfig, projected_schema: &SchemaRef, @@ -1384,10 +1450,15 @@ fn get_projected_output_ordering( return false; } + let indices = base_config + .projection_exprs + .as_ref() + .map(|p| p.ordered_column_indices()); + let statistics = match MinMaxStatistics::new_from_files( &new_ordering, projected_schema, - base_config.projection.as_deref(), + indices.as_deref(), group.iter(), ) { Ok(statistics) => statistics, @@ -1448,7 +1519,7 @@ mod tests { use 
datafusion_common::{assert_batches_eq, internal_err}; use datafusion_expr::{Operator, SortExpr}; use datafusion_physical_expr::create_physical_sort_expr; - use datafusion_physical_expr::expressions::{BinaryExpr, Literal}; + use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; /// Returns the column names on the schema @@ -1603,7 +1674,7 @@ mod tests { ); let source_statistics = conf.file_source.statistics().unwrap(); - let conf_stats = conf.statistics().unwrap(); + let conf_stats = conf.partition_statistics(None).unwrap(); // projection should be reflected in the file source statistics assert_eq!(conf_stats.num_rows, Precision::Inexact(3)); @@ -2112,7 +2183,7 @@ mod tests { file_schema, Arc::new(MockSource::default()), ) - .with_projection(projection) + .with_projection_indices(projection) .with_statistics(statistics) .with_table_partition_cols(table_partition_cols) .build() @@ -2165,7 +2236,7 @@ mod tests { // Build with various configurations let config = builder .with_limit(Some(1000)) - .with_projection(Some(vec![0, 1])) + .with_projection_indices(Some(vec![0, 1])) .with_table_partition_cols(vec![Field::new( "date", wrap_partition_type_in_dict(DataType::Utf8), @@ -2188,7 +2259,10 @@ mod tests { assert_eq!(config.object_store_url, object_store_url); assert_eq!(*config.file_schema(), file_schema); assert_eq!(config.limit, Some(1000)); - assert_eq!(config.projection, Some(vec![0, 1])); + assert_eq!( + config.projection_exprs.as_ref().map(|p| p.column_indices()), + Some(vec![0, 1]) + ); assert_eq!(config.table_partition_cols().len(), 1); assert_eq!(config.table_partition_cols()[0].name(), "date"); assert_eq!(config.file_groups.len(), 1); @@ -2222,7 +2296,7 @@ mod tests { Arc::clone(&file_schema), Arc::clone(&file_source), ) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); // Simulate projection being updated. Since the filter has already been pushed down, @@ -2271,7 +2345,10 @@ mod tests { assert_eq!(config.object_store_url, object_store_url); assert_eq!(*config.file_schema(), file_schema); assert_eq!(config.limit, None); - assert_eq!(config.projection, None); + assert_eq!( + config.projection_exprs.as_ref().map(|p| p.column_indices()), + None + ); assert!(config.table_partition_cols().is_empty()); assert!(config.file_groups.is_empty()); assert_eq!( @@ -2326,7 +2403,7 @@ mod tests { Arc::clone(&schema), Arc::clone(&file_source), ) - .with_projection(Some(vec![0, 2])) + .with_projection_indices(Some(vec![0, 2])) .with_limit(Some(10)) .with_table_partition_cols(partition_cols.clone()) .with_file(file.clone()) @@ -2344,7 +2421,13 @@ mod tests { let partition_cols = partition_cols.into_iter().map(Arc::new).collect::>(); assert_eq!(new_config.object_store_url, object_store_url); assert_eq!(*new_config.file_schema(), schema); - assert_eq!(new_config.projection, Some(vec![0, 2])); + assert_eq!( + new_config + .projection_exprs + .as_ref() + .map(|p| p.column_indices()), + Some(vec![0, 2]) + ); assert_eq!(new_config.limit, Some(10)); assert_eq!(*new_config.table_partition_cols(), partition_cols); assert_eq!(new_config.file_groups.len(), 1); @@ -2510,4 +2593,91 @@ mod tests { Ok(()) } + + #[test] + fn test_partition_statistics_projection() { + // This test verifies that partition_statistics applies projection correctly. 
+ // The old implementation had a bug where it returned file group statistics + // without applying the projection, returning all column statistics instead + // of just the projected ones. + + use crate::source::DataSourceExec; + use datafusion_physical_plan::ExecutionPlan; + + // Create a schema with 4 columns + let schema = Arc::new(Schema::new(vec![ + Field::new("col0", DataType::Int32, false), + Field::new("col1", DataType::Int32, false), + Field::new("col2", DataType::Int32, false), + Field::new("col3", DataType::Int32, false), + ])); + + // Create statistics for all 4 columns + let file_group_stats = Statistics { + num_rows: Precision::Exact(100), + total_byte_size: Precision::Exact(1024), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(0), + ..ColumnStatistics::new_unknown() + }, + ColumnStatistics { + null_count: Precision::Exact(5), + ..ColumnStatistics::new_unknown() + }, + ColumnStatistics { + null_count: Precision::Exact(10), + ..ColumnStatistics::new_unknown() + }, + ColumnStatistics { + null_count: Precision::Exact(15), + ..ColumnStatistics::new_unknown() + }, + ], + }; + + // Create a file group with statistics + let file_group = FileGroup::new(vec![PartitionedFile::new("test.parquet", 1024)]) + .with_statistics(Arc::new(file_group_stats)); + + // Create a FileScanConfig with projection: only keep columns 0 and 2 + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::parse("test:///").unwrap(), + Arc::clone(&schema), + Arc::new(MockSource::default()), + ) + .with_projection_indices(Some(vec![0, 2])) // Only project columns 0 and 2 + .with_file_groups(vec![file_group]) + .build(); + + // Create a DataSourceExec from the config + let exec = DataSourceExec::from_data_source(config); + + // Get statistics for partition 0 + let partition_stats = exec.partition_statistics(Some(0)).unwrap(); + + // Verify that only 2 columns are in the statistics (the projected ones) + assert_eq!( + partition_stats.column_statistics.len(), + 2, + "Expected 2 column statistics (projected), but got {}", + partition_stats.column_statistics.len() + ); + + // Verify the column statistics are for columns 0 and 2 + assert_eq!( + partition_stats.column_statistics[0].null_count, + Precision::Exact(0), + "First projected column should be col0 with 0 nulls" + ); + assert_eq!( + partition_stats.column_statistics[1].null_count, + Precision::Exact(10), + "Second projected column should be col2 with 10 nulls" + ); + + // Verify row count and byte size are preserved + assert_eq!(partition_stats.num_rows, Precision::Exact(100)); + assert_eq!(partition_stats.total_byte_size, Precision::Exact(1024)); + } } diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs index eb55aa9b0b0d..7d5c8c4834ea 100644 --- a/datafusion/datasource/src/memory.rs +++ b/datafusion/datasource/src/memory.rs @@ -21,6 +21,7 @@ use std::collections::BinaryHeap; use std::fmt; use std::fmt::Debug; use std::ops::Deref; +use std::slice::from_ref; use std::sync::Arc; use crate::sink::DataSink; @@ -192,12 +193,27 @@ impl DataSource for MemorySourceConfig { SchedulingType::Cooperative } - fn statistics(&self) -> Result { - Ok(common::compute_record_batch_statistics( - &self.partitions, - &self.schema, - self.projection.clone(), - )) + fn partition_statistics(&self, partition: Option) -> Result { + if let Some(partition) = partition { + // Compute statistics for a specific partition + if let Some(batches) = self.partitions.get(partition) { + Ok(common::compute_record_batch_statistics( 
+ from_ref(batches), + &self.schema, + self.projection.clone(), + )) + } else { + // Invalid partition index + Ok(Statistics::new_unknown(&self.projected_schema)) + } + } else { + // Compute statistics across all partitions + Ok(common::compute_record_batch_statistics( + &self.partitions, + &self.schema, + self.projection.clone(), + )) + } } fn with_fetch(&self, limit: Option) -> Option> { diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index 80b44ad5949a..8d988bdb31be 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -310,7 +310,6 @@ pub async fn calculate_range( /// Returns a `Result` wrapping a `usize` that represents the position of the first newline character found within the specified range. If no newline is found, it returns the length of the scanned data, effectively indicating the end of the range. /// /// The function returns an `Error` if any issues arise while reading from the object store or processing the data stream. -/// async fn find_first_newline( object_store: &Arc, location: &Path, diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 20d9a1d6e53f..11a8a3867b80 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -151,7 +151,21 @@ pub trait DataSource: Send + Sync + Debug { fn scheduling_type(&self) -> SchedulingType { SchedulingType::NonCooperative } - fn statistics(&self) -> Result; + + /// Returns statistics for a specific partition, or aggregate statistics + /// across all partitions if `partition` is `None`. + fn partition_statistics(&self, partition: Option) -> Result; + + /// Returns aggregate statistics across all partitions. + /// + /// # Deprecated + /// Use [`Self::partition_statistics`] instead, which provides more fine-grained + /// control over statistics retrieval (per-partition or aggregate). + #[deprecated(since = "51.0.0", note = "Use partition_statistics instead")] + fn statistics(&self) -> Result { + self.partition_statistics(None) + } + /// Return a copy of this DataSource with a new fetch limit fn with_fetch(&self, _limit: Option) -> Option>; fn fetch(&self) -> Option; @@ -285,21 +299,7 @@ impl ExecutionPlan for DataSourceExec { } fn partition_statistics(&self, partition: Option) -> Result { - if let Some(partition) = partition { - let mut statistics = Statistics::new_unknown(&self.schema()); - if let Some(file_config) = - self.data_source.as_any().downcast_ref::() - { - if let Some(file_group) = file_config.file_groups.get(partition) { - if let Some(stat) = file_group.file_statistics(None) { - statistics = stat.clone(); - } - } - } - Ok(statistics) - } else { - Ok(self.data_source.statistics()?) - } + self.data_source.partition_statistics(partition) } fn with_fetch(&self, limit: Option) -> Option> { diff --git a/datafusion/datasource/src/table_schema.rs b/datafusion/datasource/src/table_schema.rs index 8e95585ce873..8002df4a99df 100644 --- a/datafusion/datasource/src/table_schema.rs +++ b/datafusion/datasource/src/table_schema.rs @@ -85,6 +85,11 @@ impl TableSchema { /// The table schema is automatically computed by appending the partition columns /// to the file schema. /// + /// You should prefer calling this method over + /// chaining [`TableSchema::from_file_schema`] and [`TableSchema::with_table_partition_cols`] + /// if you have both the file schema and partition columns available at construction time + /// since it avoids re-computing the table schema. 
+ /// /// # Arguments /// /// * `file_schema` - Schema of the data files (without partition columns) @@ -121,17 +126,24 @@ impl TableSchema { } } - /// Create a new TableSchema from a file schema with no partition columns. + /// Create a new TableSchema with no partition columns. + /// + /// You should prefer calling [`TableSchema::new`] if you have partition columns at + /// construction time since it avoids re-computing the table schema. pub fn from_file_schema(file_schema: SchemaRef) -> Self { Self::new(file_schema, vec![]) } - /// Set the table partition columns and rebuild the table schema. - pub fn with_table_partition_cols( - mut self, - table_partition_cols: Vec, - ) -> TableSchema { - self.table_partition_cols = table_partition_cols; + /// Add partition columns to an existing TableSchema, returning a new instance. + /// + /// You should prefer calling [`TableSchema::new`] instead of chaining [`TableSchema::from_file_schema`] + /// into [`TableSchema::with_table_partition_cols`] if you have partition columns at construction time + /// since it avoids re-computing the table schema. + pub fn with_table_partition_cols(mut self, partition_cols: Vec) -> Self { + self.table_partition_cols = partition_cols; + let mut builder = SchemaBuilder::from(self.file_schema.as_ref()); + builder.extend(self.table_partition_cols.iter().cloned()); + self.table_schema = Arc::new(builder.finish()); self } diff --git a/datafusion/datasource/src/test_util.rs b/datafusion/datasource/src/test_util.rs index f0aff1fa62b7..feb704af9913 100644 --- a/datafusion/datasource/src/test_util.rs +++ b/datafusion/datasource/src/test_util.rs @@ -22,7 +22,8 @@ use crate::{ use std::sync::Arc; -use arrow::datatypes::{Schema, SchemaRef}; +use crate::TableSchema; +use arrow::datatypes::Schema; use datafusion_common::{Result, Statistics}; use datafusion_physical_expr::{expressions::Column, PhysicalExpr}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; @@ -66,7 +67,7 @@ impl FileSource for MockSource { Arc::new(Self { ..self.clone() }) } - fn with_schema(&self, _schema: SchemaRef) -> Arc { + fn with_schema(&self, _schema: TableSchema) -> Arc { Arc::new(Self { ..self.clone() }) } diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs index c87b307c5fb8..08e5b6a5df83 100644 --- a/datafusion/datasource/src/url.rs +++ b/datafusion/datasource/src/url.rs @@ -252,7 +252,10 @@ impl ListingTableUrl { .boxed(), // If the head command fails, it is likely that object doesn't exist. // Retry as though it were a prefix (aka a collection) - Err(_) => list_with_cache(ctx, store, &self.prefix).await?, + Err(object_store::Error::NotFound { .. }) => { + list_with_cache(ctx, store, &self.prefix).await? + } + Err(e) => return Err(e.into()), } }; @@ -382,7 +385,6 @@ const GLOB_START_CHARS: [char; 3] = ['?', '*', '[']; /// /// Path delimiters are determined using [`std::path::is_separator`] which /// permits `/` as a path delimiter even on Windows platforms. 
-/// #[cfg(not(target_arch = "wasm32"))] fn split_glob_expression(path: &str) -> Option<(&str, &str)> { let mut last_separator = 0; @@ -405,6 +407,8 @@ fn split_glob_expression(path: &str) -> Option<(&str, &str)> { #[cfg(test)] mod tests { use super::*; + use async_trait::async_trait; + use bytes::Bytes; use datafusion_common::config::TableOptions; use datafusion_common::DFSchema; use datafusion_execution::config::SessionConfig; @@ -414,9 +418,13 @@ mod tests { use datafusion_expr::{AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF}; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; - use object_store::PutPayload; + use object_store::{ + GetOptions, GetResult, ListResult, MultipartUpload, PutMultipartOptions, + PutPayload, + }; use std::any::Any; use std::collections::HashMap; + use std::ops::Range; use tempfile::tempdir; #[test] @@ -632,48 +640,68 @@ mod tests { } #[tokio::test] - async fn test_list_files() { - let store = object_store::memory::InMemory::new(); + async fn test_list_files() -> Result<()> { + let store = MockObjectStore { + in_mem: object_store::memory::InMemory::new(), + forbidden_paths: vec!["forbidden/e.parquet".into()], + }; + // Create some files: create_file(&store, "a.parquet").await; create_file(&store, "/t/b.parquet").await; create_file(&store, "/t/c.csv").await; create_file(&store, "/t/d.csv").await; + // This file returns a permission error. + create_file(&store, "/forbidden/e.parquet").await; + assert_eq!( - list_all_files("/", &store, "parquet").await, + list_all_files("/", &store, "parquet").await?, vec!["a.parquet"], ); // test with and without trailing slash assert_eq!( - list_all_files("/t/", &store, "parquet").await, + list_all_files("/t/", &store, "parquet").await?, vec!["t/b.parquet"], ); assert_eq!( - list_all_files("/t", &store, "parquet").await, + list_all_files("/t", &store, "parquet").await?, vec!["t/b.parquet"], ); // test with and without trailing slash assert_eq!( - list_all_files("/t", &store, "csv").await, + list_all_files("/t", &store, "csv").await?, vec!["t/c.csv", "t/d.csv"], ); assert_eq!( - list_all_files("/t/", &store, "csv").await, + list_all_files("/t/", &store, "csv").await?, vec!["t/c.csv", "t/d.csv"], ); // Test a non existing prefix assert_eq!( - list_all_files("/NonExisting", &store, "csv").await, + list_all_files("/NonExisting", &store, "csv").await?, vec![] as Vec ); assert_eq!( - list_all_files("/NonExisting/", &store, "csv").await, + list_all_files("/NonExisting/", &store, "csv").await?, vec![] as Vec ); + + // Including forbidden.parquet generates an error. + let Err(DataFusionError::ObjectStore(err)) = + list_all_files("/forbidden/e.parquet", &store, "parquet").await + else { + panic!("Expected ObjectStore error"); + }; + + let object_store::Error::PermissionDenied { .. 
} = &*err else { + panic!("Expected PermissionDenied error"); + }; + + Ok(()) } /// Creates a file with "hello world" content at the specified path @@ -691,10 +719,8 @@ mod tests { url: &str, store: &dyn ObjectStore, file_extension: &str, - ) -> Vec { - try_list_all_files(url, store, file_extension) - .await - .unwrap() + ) -> Result> { + try_list_all_files(url, store, file_extension).await } /// Runs "list_all_files" and returns their paths @@ -716,6 +742,95 @@ mod tests { Ok(files) } + #[derive(Debug)] + struct MockObjectStore { + in_mem: object_store::memory::InMemory, + forbidden_paths: Vec, + } + + impl std::fmt::Display for MockObjectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.in_mem.fmt(f) + } + } + + #[async_trait] + impl ObjectStore for MockObjectStore { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: object_store::PutOptions, + ) -> object_store::Result { + self.in_mem.put_opts(location, payload, opts).await + } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOptions, + ) -> object_store::Result> { + self.in_mem.put_multipart_opts(location, opts).await + } + + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> object_store::Result { + self.in_mem.get_opts(location, options).await + } + + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> object_store::Result> { + self.in_mem.get_ranges(location, ranges).await + } + + async fn head(&self, location: &Path) -> object_store::Result { + if self.forbidden_paths.contains(location) { + Err(object_store::Error::PermissionDenied { + path: location.to_string(), + source: "forbidden".into(), + }) + } else { + self.in_mem.head(location).await + } + } + + async fn delete(&self, location: &Path) -> object_store::Result<()> { + self.in_mem.delete(location).await + } + + fn list( + &self, + prefix: Option<&Path>, + ) -> BoxStream<'static, object_store::Result> { + self.in_mem.list(prefix) + } + + async fn list_with_delimiter( + &self, + prefix: Option<&Path>, + ) -> object_store::Result { + self.in_mem.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> { + self.in_mem.copy(from, to).await + } + + async fn copy_if_not_exists( + &self, + from: &Path, + to: &Path, + ) -> object_store::Result<()> { + self.in_mem.copy_if_not_exists(from, to).await + } + } + struct MockSession { config: SessionConfig, runtime_env: Arc, diff --git a/datafusion/datasource/src/write/mod.rs b/datafusion/datasource/src/write/mod.rs index 3694568682a5..85832f81bc18 100644 --- a/datafusion/datasource/src/write/mod.rs +++ b/datafusion/datasource/src/write/mod.rs @@ -162,7 +162,11 @@ impl ObjectWriterBuilder { /// # let object_store = Arc::new(InMemory::new()); /// let mut builder = ObjectWriterBuilder::new(compression_type, &location, object_store); /// builder.set_buffer_size(Some(20 * 1024 * 1024)); //20 MiB - /// assert_eq!(builder.get_buffer_size(), Some(20 * 1024 * 1024), "Internal error: Builder buffer size doesn't match"); + /// assert_eq!( + /// builder.get_buffer_size(), + /// Some(20 * 1024 * 1024), + /// "Internal error: Builder buffer size doesn't match" + /// ); /// ``` pub fn set_buffer_size(&mut self, buffer_size: Option) { self.buffer_size = buffer_size; @@ -182,7 +186,11 @@ impl ObjectWriterBuilder { /// # let object_store = Arc::new(InMemory::new()); /// let builder = ObjectWriterBuilder::new(compression_type, &location, object_store) /// 
.with_buffer_size(Some(20 * 1024 * 1024)); //20 MiB - /// assert_eq!(builder.get_buffer_size(), Some(20 * 1024 * 1024), "Internal error: Builder buffer size doesn't match"); + /// assert_eq!( + /// builder.get_buffer_size(), + /// Some(20 * 1024 * 1024), + /// "Internal error: Builder buffer size doesn't match" + /// ); /// ``` pub fn with_buffer_size(mut self, buffer_size: Option) -> Self { self.buffer_size = buffer_size; diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs index 491b1aca69ea..a0b180bf4020 100644 --- a/datafusion/execution/src/config.rs +++ b/datafusion/execution/src/config.rs @@ -44,12 +44,15 @@ use datafusion_common::{ /// shorthand for setting `datafusion.execution.batch_size`. /// /// ``` -/// use datafusion_execution::config::SessionConfig; /// use datafusion_common::ScalarValue; +/// use datafusion_execution::config::SessionConfig; /// /// let config = SessionConfig::new() -/// .set("datafusion.execution.batch_size", &ScalarValue::UInt64(Some(1234))) -/// .set_bool("datafusion.execution.parquet.pushdown_filters", true); +/// .set( +/// "datafusion.execution.batch_size", +/// &ScalarValue::UInt64(Some(1234)), +/// ) +/// .set_bool("datafusion.execution.parquet.pushdown_filters", true); /// /// assert_eq!(config.batch_size(), 1234); /// assert_eq!(config.options().execution.batch_size, 1234); @@ -502,8 +505,8 @@ impl SessionConfig { /// /// # Example /// ``` - /// use std::sync::Arc; /// use datafusion_execution::config::SessionConfig; + /// use std::sync::Arc; /// /// // application-specific extension types /// struct Ext1(u8); @@ -545,8 +548,8 @@ impl SessionConfig { /// /// # Example /// ``` - /// use std::sync::Arc; /// use datafusion_execution::config::SessionConfig; + /// use std::sync::Arc; /// /// // application-specific extension types /// struct Ext1(u8); diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs index 306df3defdbb..d6b55182aa6b 100644 --- a/datafusion/execution/src/memory_pool/pool.rs +++ b/datafusion/execution/src/memory_pool/pool.rs @@ -346,8 +346,10 @@ impl TrackConsumersPool { /// # Example /// /// ```rust + /// use datafusion_execution::memory_pool::{ + /// FairSpillPool, GreedyMemoryPool, TrackConsumersPool, + /// }; /// use std::num::NonZeroUsize; - /// use datafusion_execution::memory_pool::{TrackConsumersPool, GreedyMemoryPool, FairSpillPool}; /// /// // Create with a greedy pool backend, reporting top 3 consumers in error messages /// let tracked_greedy = TrackConsumersPool::new( diff --git a/datafusion/execution/src/parquet_encryption.rs b/datafusion/execution/src/parquet_encryption.rs index 73881e11ca72..027421e08f54 100644 --- a/datafusion/execution/src/parquet_encryption.rs +++ b/datafusion/execution/src/parquet_encryption.rs @@ -41,14 +41,14 @@ pub trait EncryptionFactory: Send + Sync + std::fmt::Debug + 'static { config: &EncryptionFactoryOptions, schema: &SchemaRef, file_path: &Path, - ) -> Result>; + ) -> Result>>; /// Generate file decryption properties to use when reading a Parquet file. 
async fn get_file_decryption_properties( &self, config: &EncryptionFactoryOptions, file_path: &Path, - ) -> Result>; + ) -> Result>>; } /// Stores [`EncryptionFactory`] implementations that can be retrieved by a unique string identifier diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs index b0d0a966b7a2..d69987600855 100644 --- a/datafusion/execution/src/runtime_env.rs +++ b/datafusion/execution/src/runtime_env.rs @@ -67,9 +67,9 @@ use url::Url; /// // restrict to using at most 100MB of memory /// let pool_size = 100 * 1024 * 1024; /// let runtime_env = RuntimeEnvBuilder::new() -/// .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))) -/// .build() -/// .unwrap(); +/// .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))) +/// .build() +/// .unwrap(); /// ``` pub struct RuntimeEnv { /// Runtime memory management diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index b5b632076b00..40c44cfb3ca2 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -1670,22 +1670,23 @@ fn cast_scalar_value( /// /// // [1, 2) U {NULL} /// let maybe_null = NullableInterval::MaybeNull { -/// values: Interval::try_new( -/// ScalarValue::Int32(Some(1)), -/// ScalarValue::Int32(Some(2)), -/// ).unwrap(), +/// values: Interval::try_new( +/// ScalarValue::Int32(Some(1)), +/// ScalarValue::Int32(Some(2)), +/// ) +/// .unwrap(), /// }; /// /// // (0, ∞) /// let not_null = NullableInterval::NotNull { -/// values: Interval::try_new( -/// ScalarValue::Int32(Some(0)), -/// ScalarValue::Int32(None), -/// ).unwrap(), +/// values: Interval::try_new(ScalarValue::Int32(Some(0)), ScalarValue::Int32(None)) +/// .unwrap(), /// }; /// /// // {NULL} -/// let null_interval = NullableInterval::Null { datatype: DataType::Int32 }; +/// let null_interval = NullableInterval::Null { +/// datatype: DataType::Int32, +/// }; /// /// // {4} /// let single_value = NullableInterval::from(ScalarValue::Int32(Some(4))); @@ -1787,22 +1788,26 @@ impl NullableInterval { /// /// ``` /// use datafusion_common::ScalarValue; - /// use datafusion_expr_common::operator::Operator; /// use datafusion_expr_common::interval_arithmetic::Interval; /// use datafusion_expr_common::interval_arithmetic::NullableInterval; + /// use datafusion_expr_common::operator::Operator; /// /// // 4 > 3 -> true /// let lhs = NullableInterval::from(ScalarValue::Int32(Some(4))); /// let rhs = NullableInterval::from(ScalarValue::Int32(Some(3))); /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap(); - /// assert_eq!(result, NullableInterval::from(ScalarValue::Boolean(Some(true)))); + /// assert_eq!( + /// result, + /// NullableInterval::from(ScalarValue::Boolean(Some(true))) + /// ); /// /// // [1, 3) > NULL -> NULL /// let lhs = NullableInterval::NotNull { /// values: Interval::try_new( - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(Some(3)), - /// ).unwrap(), + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(Some(3)), + /// ) + /// .unwrap(), /// }; /// let rhs = NullableInterval::from(ScalarValue::Int32(None)); /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap(); @@ -1811,22 +1816,27 @@ impl NullableInterval { /// // [1, 3] > [2, 4] -> [false, true] /// let lhs = NullableInterval::NotNull { /// values: Interval::try_new( - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(Some(3)), - /// ).unwrap(), + /// 
ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(Some(3)), + /// ) + /// .unwrap(), /// }; /// let rhs = NullableInterval::NotNull { - /// values: Interval::try_new( - /// ScalarValue::Int32(Some(2)), - /// ScalarValue::Int32(Some(4)), - /// ).unwrap(), + /// values: Interval::try_new( + /// ScalarValue::Int32(Some(2)), + /// ScalarValue::Int32(Some(4)), + /// ) + /// .unwrap(), /// }; /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap(); /// // Both inputs are valid (non-null), so result must be non-null - /// assert_eq!(result, NullableInterval::NotNull { - /// // Uncertain whether inequality is true or false - /// values: Interval::UNCERTAIN, - /// }); + /// assert_eq!( + /// result, + /// NullableInterval::NotNull { + /// // Uncertain whether inequality is true or false + /// values: Interval::UNCERTAIN, + /// } + /// ); /// ``` pub fn apply_operator(&self, op: &Operator, rhs: &Self) -> Result { match op { @@ -1924,7 +1934,8 @@ impl NullableInterval { /// values: Interval::try_new( /// ScalarValue::Int32(Some(1)), /// ScalarValue::Int32(Some(4)), - /// ).unwrap(), + /// ) + /// .unwrap(), /// }; /// assert_eq!(interval.single_value(), None); /// ``` diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 5fd4518e2e57..5cb7a17ee312 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -22,9 +22,9 @@ use std::hash::Hash; use crate::type_coercion::aggregates::NUMERICS; use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; -use datafusion_common::internal_err; use datafusion_common::types::{LogicalType, LogicalTypeRef, NativeType}; use datafusion_common::utils::ListCoercion; +use datafusion_common::{internal_err, plan_err, Result}; use indexmap::IndexSet; use itertools::Itertools; @@ -84,6 +84,15 @@ pub enum Volatility { Volatile, } +/// Represents the arity (number of arguments) of a function signature +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Arity { + /// Fixed number of arguments + Fixed(usize), + /// Variable number of arguments (e.g., Variadic, VariadicAny, UserDefined) + Variable, +} + /// The types of arguments for which a function has implementations. /// /// [`TypeSignature`] **DOES NOT** define the types that a user query could call the @@ -118,11 +127,10 @@ pub enum Volatility { /// ``` /// # use arrow::datatypes::DataType; /// # use datafusion_expr_common::signature::{TypeSignature}; -/// // Declares the function must be invoked with a single argument of type `Utf8View`. -/// // if a user calls the function with `Utf8` or `LargeUtf8`, DataFusion will -/// // automatically add a cast to `Utf8View` during planning. -/// let type_signature = TypeSignature::Exact(vec![DataType::Utf8View]); -/// +/// // Declares the function must be invoked with a single argument of type `Utf8View`. +/// // if a user calls the function with `Utf8` or `LargeUtf8`, DataFusion will +/// // automatically add a cast to `Utf8View` during planning. 
+/// let type_signature = TypeSignature::Exact(vec![DataType::Utf8View]); /// ``` /// /// # Example: Timestamps @@ -135,11 +143,11 @@ pub enum Volatility { /// # use arrow::datatypes::{DataType, TimeUnit}; /// # use datafusion_expr_common::signature::{TIMEZONE_WILDCARD, TypeSignature}; /// let type_signature = TypeSignature::Exact(vec![ -/// // A nanosecond precision timestamp with ANY timezone -/// // matches Timestamp(Nanosecond, Some("+0:00")) -/// // matches Timestamp(Nanosecond, Some("+5:00")) -/// // does not match Timestamp(Nanosecond, None) -/// DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())), +/// // A nanosecond precision timestamp with ANY timezone +/// // matches Timestamp(Nanosecond, Some("+0:00")) +/// // matches Timestamp(Nanosecond, Some("+5:00")) +/// // does not match Timestamp(Nanosecond, None) +/// DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())), /// ]); /// ``` #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] @@ -245,6 +253,69 @@ impl TypeSignature { pub fn is_one_of(&self) -> bool { matches!(self, TypeSignature::OneOf(_)) } + + /// Returns the arity (expected number of arguments) for this type signature. + /// + /// Returns `Arity::Fixed(n)` for signatures with a specific argument count, + /// or `Arity::Variable` for variable-arity signatures like `Variadic`, `VariadicAny`, `UserDefined`. + /// + /// # Examples + /// + /// ``` + /// # use datafusion_expr_common::signature::{TypeSignature, Arity}; + /// # use arrow::datatypes::DataType; + /// // Exact signature has fixed arity + /// let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + /// assert_eq!(sig.arity(), Arity::Fixed(2)); + /// + /// // Variadic signature has variable arity + /// let sig = TypeSignature::VariadicAny; + /// assert_eq!(sig.arity(), Arity::Variable); + /// ``` + pub fn arity(&self) -> Arity { + match self { + TypeSignature::Exact(types) => Arity::Fixed(types.len()), + TypeSignature::Uniform(count, _) => Arity::Fixed(*count), + TypeSignature::Numeric(count) => Arity::Fixed(*count), + TypeSignature::String(count) => Arity::Fixed(*count), + TypeSignature::Comparable(count) => Arity::Fixed(*count), + TypeSignature::Any(count) => Arity::Fixed(*count), + TypeSignature::Coercible(types) => Arity::Fixed(types.len()), + TypeSignature::Nullary => Arity::Fixed(0), + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments, + .. + }) => Arity::Fixed(arguments.len()), + TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray) => { + Arity::Fixed(1) + } + TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray) => { + Arity::Fixed(1) + } + TypeSignature::OneOf(variants) => { + // If any variant is Variable, the whole OneOf is Variable + let has_variable = variants.iter().any(|v| v.arity() == Arity::Variable); + if has_variable { + return Arity::Variable; + } + // Otherwise, get max arity from all fixed arity variants + let max_arity = variants + .iter() + .filter_map(|v| match v.arity() { + Arity::Fixed(n) => Some(n), + Arity::Variable => None, + }) + .max(); + match max_arity { + Some(n) => Arity::Fixed(n), + None => Arity::Variable, + } + } + TypeSignature::Variadic(_) + | TypeSignature::VariadicAny + | TypeSignature::UserDefined => Arity::Variable, + } + } } /// Represents the class of types that can be used in a function signature. 
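The hunk above adds the `Arity` enum and `TypeSignature::arity()`. As a minimal sketch of how a caller might use that API to pre-validate argument counts before any coercion work, the helper below is hypothetical (`check_arg_count` is not part of this patch), and it only performs the coarse upper-bound check that `OneOf`'s max-arity behaviour allows:

```rust
use arrow::datatypes::DataType;
use datafusion_common::{plan_err, Result};
use datafusion_expr_common::signature::{Arity, TypeSignature};

// Hypothetical helper: reject calls whose argument count can never satisfy
// the signature, before doing any per-type coercion work.
fn check_arg_count(sig: &TypeSignature, provided: usize) -> Result<()> {
    match sig.arity() {
        // `OneOf` reports the maximum fixed arity of its variants, so this
        // is only a coarse upper-bound check.
        Arity::Fixed(expected) if provided > expected => {
            plan_err!("expected at most {} arguments, got {}", expected, provided)
        }
        // Variadic / VariadicAny / UserDefined: defer to signature matching.
        _ => Ok(()),
    }
}

fn main() -> Result<()> {
    let sig = TypeSignature::OneOf(vec![
        TypeSignature::Exact(vec![DataType::Utf8]),
        TypeSignature::Exact(vec![DataType::Utf8, DataType::Int64]),
    ]);
    // OneOf of fixed-arity variants reports the maximum arity (2 here).
    assert_eq!(sig.arity(), Arity::Fixed(2));
    check_arg_count(&sig, 2)?;
    assert!(check_arg_count(&sig, 3).is_err());
    Ok(())
}
```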
@@ -336,7 +407,7 @@ impl TypeSignatureClass { &self, native_type: &NativeType, origin_type: &DataType, - ) -> datafusion_common::Result { + ) -> Result { match self { TypeSignatureClass::Native(logical_type) => { logical_type.native().default_cast_for(origin_type) @@ -486,6 +557,174 @@ impl TypeSignature { } } + /// Return string representation of the function signature with parameter names. + /// + /// This method is similar to [`Self::to_string_repr`] but uses parameter names + /// instead of types when available. This is useful for generating more helpful + /// error messages. + /// + /// # Arguments + /// * `parameter_names` - Optional slice of parameter names. When provided, these + /// names will be used instead of type names in the output. + /// + /// # Examples + /// ``` + /// # use datafusion_expr_common::signature::TypeSignature; + /// # use arrow::datatypes::DataType; + /// let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + /// + /// // Without names: shows types only + /// assert_eq!(sig.to_string_repr_with_names(None), vec!["Int32, Utf8"]); + /// + /// // With names: shows parameter names with types + /// assert_eq!( + /// sig.to_string_repr_with_names(Some(&["id".to_string(), "name".to_string()])), + /// vec!["id: Int32, name: Utf8"] + /// ); + /// ``` + pub fn to_string_repr_with_names( + &self, + parameter_names: Option<&[String]>, + ) -> Vec { + match self { + TypeSignature::Exact(types) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .zip(types.iter()) + .map(|(name, typ)| format!("{name}: {typ}")) + .collect::>() + .join(", ")] + } else { + vec![Self::join_types(types, ", ")] + } + } + TypeSignature::Any(count) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: Any")) + .collect::>() + .join(", ")] + } else { + vec![std::iter::repeat_n("Any", *count) + .collect::>() + .join(", ")] + } + } + TypeSignature::Uniform(count, types) => { + if let Some(names) = parameter_names { + let type_str = Self::join_types(types, "/"); + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: {type_str}")) + .collect::>() + .join(", ")] + } else { + self.to_string_repr() + } + } + TypeSignature::Coercible(coercions) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .zip(coercions.iter()) + .map(|(name, coercion)| format!("{name}: {coercion}")) + .collect::>() + .join(", ")] + } else { + vec![Self::join_types(coercions, ", ")] + } + } + TypeSignature::Comparable(count) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: Comparable")) + .collect::>() + .join(", ")] + } else { + self.to_string_repr() + } + } + TypeSignature::Numeric(count) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: Numeric")) + .collect::>() + .join(", ")] + } else { + self.to_string_repr() + } + } + TypeSignature::String(count) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: String")) + .collect::>() + .join(", ")] + } else { + self.to_string_repr() + } + } + TypeSignature::Nullary => self.to_string_repr(), + TypeSignature::ArraySignature(array_sig) => { + if let Some(names) = parameter_names { + match array_sig { + ArrayFunctionSignature::Array { arguments, .. 
} => { + vec![names + .iter() + .zip(arguments.iter()) + .map(|(name, arg_type)| format!("{name}: {arg_type}")) + .collect::>() + .join(", ")] + } + ArrayFunctionSignature::RecursiveArray => { + vec![names + .iter() + .take(1) + .map(|name| format!("{name}: recursive_array")) + .collect::>() + .join(", ")] + } + ArrayFunctionSignature::MapArray => { + vec![names + .iter() + .take(1) + .map(|name| format!("{name}: map_array")) + .collect::>() + .join(", ")] + } + } + } else { + self.to_string_repr() + } + } + TypeSignature::OneOf(sigs) => sigs + .iter() + .flat_map(|s| s.to_string_repr_with_names(parameter_names)) + .collect(), + TypeSignature::UserDefined => { + if let Some(names) = parameter_names { + vec![names.join(", ")] + } else { + self.to_string_repr() + } + } + // Variable arity signatures cannot use parameter names + TypeSignature::Variadic(_) | TypeSignature::VariadicAny => { + self.to_string_repr() + } + } + } + /// Helper function to join types with specified delimiter. pub fn join_types(types: &[T], delimiter: &str) -> String { types @@ -618,8 +857,8 @@ fn get_data_types(native_type: &NativeType) -> Vec { /// # Examples /// /// ``` +/// use datafusion_common::types::{logical_binary, logical_string, NativeType}; /// use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; -/// use datafusion_common::types::{NativeType, logical_binary, logical_string}; /// /// // Exact coercion that only accepts timestamp types /// let exact = Coercion::new_exact(TypeSignatureClass::Timestamp); @@ -628,7 +867,7 @@ fn get_data_types(native_type: &NativeType) -> Vec { /// let implicit = Coercion::new_implicit( /// TypeSignatureClass::Native(logical_string()), /// vec![TypeSignatureClass::Native(logical_binary())], -/// NativeType::String +/// NativeType::String, /// ); /// ``` /// @@ -804,6 +1043,13 @@ pub struct Signature { pub type_signature: TypeSignature, /// The volatility of the function. See [Volatility] for more information. pub volatility: Volatility, + /// Optional parameter names for the function arguments. + /// + /// If provided, enables named argument notation for function calls (e.g., `func(a => 1, b => 2)`). + /// The length must match the number of arguments defined by `type_signature`. + /// + /// Defaults to `None`, meaning only positional arguments are supported. + pub parameter_names: Option>, } impl Signature { @@ -812,6 +1058,7 @@ impl Signature { Signature { type_signature, volatility, + parameter_names: None, } } /// An arbitrary number of arguments with the same type, from those listed in `common_types`. @@ -819,6 +1066,7 @@ impl Signature { Self { type_signature: TypeSignature::Variadic(common_types), volatility, + parameter_names: None, } } /// User-defined coercion rules for the function. @@ -826,6 +1074,7 @@ impl Signature { Self { type_signature: TypeSignature::UserDefined, volatility, + parameter_names: None, } } @@ -834,6 +1083,7 @@ impl Signature { Self { type_signature: TypeSignature::Numeric(arg_count), volatility, + parameter_names: None, } } @@ -842,6 +1092,7 @@ impl Signature { Self { type_signature: TypeSignature::String(arg_count), volatility, + parameter_names: None, } } @@ -850,6 +1101,7 @@ impl Signature { Self { type_signature: TypeSignature::VariadicAny, volatility, + parameter_names: None, } } /// A fixed number of arguments of the same type, from those listed in `valid_types`. 
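To connect the new `parameter_names` field on `Signature` (added in the hunks above) with the argument-resolution logic introduced further down in this patch, here is a minimal end-to-end sketch. Two assumptions are made: the new `arguments` module is exported as `datafusion_expr::arguments`, and the function name `substr_like` is purely illustrative.

```rust
use arrow::datatypes::DataType;
use datafusion_common::Result;
// Path assumed: the `arguments` module is added later in this patch.
use datafusion_expr::arguments::resolve_function_arguments;
use datafusion_expr::lit;
use datafusion_expr_common::signature::{Signature, Volatility};

fn main() -> Result<()> {
    // Declare a hypothetical `substr_like(str, start_pos)` with named parameters.
    let sig = Signature::exact(
        vec![DataType::Utf8, DataType::Int64],
        Volatility::Immutable,
    )
    .with_parameter_names(vec!["str".to_string(), "start_pos".to_string()])?;

    // A call written as `substr_like(start_pos => 2, str => 'abc')`:
    // arguments arrive in call order, each paired with its parameter name.
    let args = vec![lit(2i64), lit("abc")];
    let arg_names = vec![Some("start_pos".to_string()), Some("str".to_string())];

    let ordered = resolve_function_arguments(
        sig.parameter_names.as_deref().unwrap(),
        args,
        arg_names,
    )?;

    // The resolver reorders the arguments into signature order: [str, start_pos].
    assert_eq!(ordered, vec![lit("abc"), lit(2i64)]);
    Ok(())
}
```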
@@ -861,6 +1113,7 @@ impl Signature { Self { type_signature: TypeSignature::Uniform(arg_count, valid_types), volatility, + parameter_names: None, } } /// Exactly matches the types in `exact_types`, in order. @@ -868,6 +1121,7 @@ impl Signature { Signature { type_signature: TypeSignature::Exact(exact_types), volatility, + parameter_names: None, } } @@ -876,6 +1130,7 @@ impl Signature { Self { type_signature: TypeSignature::Coercible(target_types), volatility, + parameter_names: None, } } @@ -884,6 +1139,7 @@ impl Signature { Self { type_signature: TypeSignature::Comparable(arg_count), volatility, + parameter_names: None, } } @@ -891,6 +1147,7 @@ impl Signature { Signature { type_signature: TypeSignature::Nullary, volatility, + parameter_names: None, } } @@ -899,6 +1156,7 @@ impl Signature { Signature { type_signature: TypeSignature::Any(arg_count), volatility, + parameter_names: None, } } @@ -907,6 +1165,7 @@ impl Signature { Signature { type_signature: TypeSignature::OneOf(type_signatures), volatility, + parameter_names: None, } } @@ -923,6 +1182,7 @@ impl Signature { }, ), volatility, + parameter_names: None, } } @@ -939,6 +1199,7 @@ impl Signature { }, ), volatility, + parameter_names: None, } } @@ -956,6 +1217,7 @@ impl Signature { }, ), volatility, + parameter_names: None, } } @@ -980,6 +1242,7 @@ impl Signature { }), ]), volatility, + parameter_names: None, } } @@ -996,6 +1259,7 @@ impl Signature { }, ), volatility, + parameter_names: None, } } @@ -1003,13 +1267,73 @@ impl Signature { pub fn array(volatility: Volatility) -> Self { Signature::arrays(1, Some(ListCoercion::FixedSizedListToList), volatility) } + + /// Add parameter names to this signature, enabling named argument notation. + /// + /// # Example + /// ``` + /// # use datafusion_expr_common::signature::{Signature, Volatility}; + /// # use arrow::datatypes::DataType; + /// let sig = + /// Signature::exact(vec![DataType::Int32, DataType::Utf8], Volatility::Immutable) + /// .with_parameter_names(vec!["count".to_string(), "name".to_string()]); + /// ``` + /// + /// # Errors + /// Returns an error if the number of parameter names doesn't match the signature's arity. + /// For signatures with variable arity (e.g., `Variadic`, `VariadicAny`), parameter names + /// cannot be specified. 
+ pub fn with_parameter_names(mut self, names: Vec>) -> Result { + let names = names.into_iter().map(Into::into).collect::>(); + // Validate that the number of names matches the signature + self.validate_parameter_names(&names)?; + self.parameter_names = Some(names); + Ok(self) + } + + /// Validate that parameter names are compatible with this signature + fn validate_parameter_names(&self, names: &[String]) -> Result<()> { + match self.type_signature.arity() { + Arity::Fixed(expected) => { + if names.len() != expected { + return plan_err!( + "Parameter names count ({}) does not match signature arity ({})", + names.len(), + expected + ); + } + } + Arity::Variable => { + // For UserDefined signatures, allow parameter names + // The function implementer is responsible for validating the names match the actual arguments + if !matches!(self.type_signature, TypeSignature::UserDefined) { + return plan_err!( + "Cannot specify parameter names for variable arity signature: {:?}", + self.type_signature + ); + } + } + } + + let mut seen = std::collections::HashSet::new(); + for name in names { + if !seen.insert(name) { + return plan_err!("Duplicate parameter name: '{}'", name); + } + } + + Ok(()) + } } #[cfg(test)] mod tests { - use datafusion_common::types::{logical_int64, logical_string}; + use datafusion_common::types::{logical_int32, logical_int64, logical_string}; use super::*; + use crate::signature::{ + ArrayFunctionArgument, ArrayFunctionSignature, Coercion, TypeSignatureClass, + }; #[test] fn supports_zero_argument_tests() { @@ -1167,4 +1491,430 @@ mod tests { ] ); } + + #[test] + fn test_signature_with_parameter_names() { + let sig = Signature::exact( + vec![DataType::Int32, DataType::Utf8], + Volatility::Immutable, + ) + .with_parameter_names(vec!["count".to_string(), "name".to_string()]) + .unwrap(); + + assert_eq!( + sig.parameter_names, + Some(vec!["count".to_string(), "name".to_string()]) + ); + assert_eq!( + sig.type_signature, + TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]) + ); + } + + #[test] + fn test_signature_parameter_names_wrong_count() { + let result = Signature::exact( + vec![DataType::Int32, DataType::Utf8], + Volatility::Immutable, + ) + .with_parameter_names(vec!["count".to_string()]); // Only 1 name for 2 args + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("does not match signature arity")); + } + + #[test] + fn test_signature_parameter_names_duplicate() { + let result = Signature::exact( + vec![DataType::Int32, DataType::Int32], + Volatility::Immutable, + ) + .with_parameter_names(vec!["count".to_string(), "count".to_string()]); + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Duplicate parameter name")); + } + + #[test] + fn test_signature_parameter_names_variadic() { + let result = Signature::variadic(vec![DataType::Int32], Volatility::Immutable) + .with_parameter_names(vec!["arg".to_string()]); + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("variable arity signature")); + } + + #[test] + fn test_signature_without_parameter_names() { + let sig = Signature::exact( + vec![DataType::Int32, DataType::Utf8], + Volatility::Immutable, + ); + + assert_eq!(sig.parameter_names, None); + } + + #[test] + fn test_signature_uniform_with_parameter_names() { + let sig = Signature::uniform(3, vec![DataType::Float64], Volatility::Immutable) + .with_parameter_names(vec!["x".to_string(), "y".to_string(), "z".to_string()]) + .unwrap(); + + 
assert_eq!( + sig.parameter_names, + Some(vec!["x".to_string(), "y".to_string(), "z".to_string()]) + ); + } + + #[test] + fn test_signature_numeric_with_parameter_names() { + let sig = Signature::numeric(2, Volatility::Immutable) + .with_parameter_names(vec!["a".to_string(), "b".to_string()]) + .unwrap(); + + assert_eq!( + sig.parameter_names, + Some(vec!["a".to_string(), "b".to_string()]) + ); + } + + #[test] + fn test_signature_nullary_with_empty_names() { + let sig = Signature::nullary(Volatility::Immutable) + .with_parameter_names(Vec::::new()) + .unwrap(); + + assert_eq!(sig.parameter_names, Some(vec![])); + } + + #[test] + fn test_to_string_repr_with_names_exact() { + let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + + assert_eq!(sig.to_string_repr_with_names(None), vec!["Int32, Utf8"]); + + let names = vec!["id".to_string(), "name".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["id: Int32, name: Utf8"] + ); + } + + #[test] + fn test_to_string_repr_with_names_any() { + let sig = TypeSignature::Any(3); + + assert_eq!(sig.to_string_repr_with_names(None), vec!["Any, Any, Any"]); + + let names = vec!["x".to_string(), "y".to_string(), "z".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["x: Any, y: Any, z: Any"] + ); + } + + #[test] + fn test_to_string_repr_with_names_one_of() { + let sig = + TypeSignature::OneOf(vec![TypeSignature::Any(2), TypeSignature::Any(3)]); + + assert_eq!( + sig.to_string_repr_with_names(None), + vec!["Any, Any", "Any, Any, Any"] + ); + + let names = vec![ + "str".to_string(), + "start_pos".to_string(), + "length".to_string(), + ]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec![ + "str: Any, start_pos: Any", + "str: Any, start_pos: Any, length: Any" + ] + ); + } + + #[test] + fn test_to_string_repr_with_names_partial() { + // This simulates providing max arity names for a OneOf signature + let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + + // Provide 3 names for 2-parameter signature (extra name is ignored via zip) + let names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["a: Int32, b: Utf8"] + ); + } + + #[test] + fn test_to_string_repr_with_names_uniform() { + let sig = TypeSignature::Uniform(2, vec![DataType::Float64]); + + assert_eq!( + sig.to_string_repr_with_names(None), + vec!["Float64, Float64"] + ); + + let names = vec!["x".to_string(), "y".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["x: Float64, y: Float64"] + ); + } + + #[test] + fn test_to_string_repr_with_names_coercible() { + let sig = TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_int32())), + Coercion::new_exact(TypeSignatureClass::Native(logical_int32())), + ]); + + let names = vec!["a".to_string(), "b".to_string()]; + let result = sig.to_string_repr_with_names(Some(&names)); + // Check that it contains the parameter names with type annotations + assert_eq!(result.len(), 1); + assert!(result[0].starts_with("a: ")); + assert!(result[0].contains(", b: ")); + } + + #[test] + fn test_to_string_repr_with_names_comparable_numeric_string() { + let comparable = TypeSignature::Comparable(3); + let numeric = TypeSignature::Numeric(2); + let string_sig = TypeSignature::String(2); + + let names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + + // All should show parameter names with type annotations + 
assert_eq!( + comparable.to_string_repr_with_names(Some(&names)), + vec!["a: Comparable, b: Comparable, c: Comparable"] + ); + assert_eq!( + numeric.to_string_repr_with_names(Some(&names)), + vec!["a: Numeric, b: Numeric"] + ); + assert_eq!( + string_sig.to_string_repr_with_names(Some(&names)), + vec!["a: String, b: String"] + ); + } + + #[test] + fn test_to_string_repr_with_names_variadic_fallback() { + let variadic = TypeSignature::Variadic(vec![DataType::Utf8, DataType::LargeUtf8]); + let names = vec!["x".to_string()]; + assert_eq!( + variadic.to_string_repr_with_names(Some(&names)), + variadic.to_string_repr() + ); + + let variadic_any = TypeSignature::VariadicAny; + assert_eq!( + variadic_any.to_string_repr_with_names(Some(&names)), + variadic_any.to_string_repr() + ); + + // UserDefined now shows parameter names when available + let user_defined = TypeSignature::UserDefined; + assert_eq!( + user_defined.to_string_repr_with_names(Some(&names)), + vec!["x"] + ); + assert_eq!( + user_defined.to_string_repr_with_names(None), + user_defined.to_string_repr() + ); + } + + #[test] + fn test_to_string_repr_with_names_nullary() { + let sig = TypeSignature::Nullary; + let names = vec!["x".to_string()]; + + // Should return empty representation, names don't apply + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["NullAry()"] + ); + assert_eq!(sig.to_string_repr_with_names(None), vec!["NullAry()"]); + } + + #[test] + fn test_to_string_repr_with_names_array_signature() { + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::Index, + ArrayFunctionArgument::Element, + ], + array_coercion: None, + }); + + assert_eq!( + sig.to_string_repr_with_names(None), + vec!["array, index, element"] + ); + + let names = vec!["arr".to_string(), "idx".to_string(), "val".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["arr: array, idx: index, val: element"] + ); + + let recursive = + TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray); + let names = vec!["array".to_string()]; + assert_eq!( + recursive.to_string_repr_with_names(Some(&names)), + vec!["array: recursive_array"] + ); + + // Test MapArray (1 argument) + let map_array = TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray); + let names = vec!["map".to_string()]; + assert_eq!( + map_array.to_string_repr_with_names(Some(&names)), + vec!["map: map_array"] + ); + } + + #[test] + fn test_type_signature_arity_exact() { + let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + assert_eq!(sig.arity(), Arity::Fixed(2)); + + let sig = TypeSignature::Exact(vec![]); + assert_eq!(sig.arity(), Arity::Fixed(0)); + } + + #[test] + fn test_type_signature_arity_uniform() { + let sig = TypeSignature::Uniform(3, vec![DataType::Float64]); + assert_eq!(sig.arity(), Arity::Fixed(3)); + + let sig = TypeSignature::Uniform(1, vec![DataType::Int32]); + assert_eq!(sig.arity(), Arity::Fixed(1)); + } + + #[test] + fn test_type_signature_arity_numeric() { + let sig = TypeSignature::Numeric(2); + assert_eq!(sig.arity(), Arity::Fixed(2)); + } + + #[test] + fn test_type_signature_arity_string() { + let sig = TypeSignature::String(3); + assert_eq!(sig.arity(), Arity::Fixed(3)); + } + + #[test] + fn test_type_signature_arity_comparable() { + let sig = TypeSignature::Comparable(2); + assert_eq!(sig.arity(), Arity::Fixed(2)); + } + + #[test] + fn test_type_signature_arity_any() { + let sig = 
TypeSignature::Any(4); + assert_eq!(sig.arity(), Arity::Fixed(4)); + } + + #[test] + fn test_type_signature_arity_coercible() { + let sig = TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_int32())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ]); + assert_eq!(sig.arity(), Arity::Fixed(2)); + } + + #[test] + fn test_type_signature_arity_nullary() { + let sig = TypeSignature::Nullary; + assert_eq!(sig.arity(), Arity::Fixed(0)); + } + + #[test] + fn test_type_signature_arity_array_signature() { + // Test Array variant with 2 arguments + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ArrayFunctionArgument::Array, ArrayFunctionArgument::Index], + array_coercion: None, + }); + assert_eq!(sig.arity(), Arity::Fixed(2)); + + // Test Array variant with 3 arguments + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::Element, + ArrayFunctionArgument::Index, + ], + array_coercion: None, + }); + assert_eq!(sig.arity(), Arity::Fixed(3)); + + // Test RecursiveArray variant + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray); + assert_eq!(sig.arity(), Arity::Fixed(1)); + + // Test MapArray variant + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray); + assert_eq!(sig.arity(), Arity::Fixed(1)); + } + + #[test] + fn test_type_signature_arity_one_of_fixed() { + // OneOf with all fixed arity variants should return max arity + let sig = TypeSignature::OneOf(vec![ + TypeSignature::Exact(vec![DataType::Int32]), + TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]), + TypeSignature::Exact(vec![ + DataType::Int32, + DataType::Utf8, + DataType::Float64, + ]), + ]); + assert_eq!(sig.arity(), Arity::Fixed(3)); + } + + #[test] + fn test_type_signature_arity_one_of_variable() { + // OneOf with variable arity variant should return Variable + let sig = TypeSignature::OneOf(vec![ + TypeSignature::Exact(vec![DataType::Int32]), + TypeSignature::VariadicAny, + ]); + assert_eq!(sig.arity(), Arity::Variable); + } + + #[test] + fn test_type_signature_arity_variadic() { + let sig = TypeSignature::Variadic(vec![DataType::Int32]); + assert_eq!(sig.arity(), Arity::Variable); + + let sig = TypeSignature::VariadicAny; + assert_eq!(sig.arity(), Arity::Variable); + } + + #[test] + fn test_type_signature_arity_user_defined() { + let sig = TypeSignature::UserDefined; + assert_eq!(sig.arity(), Arity::Variable); + } } diff --git a/datafusion/expr/src/arguments.rs b/datafusion/expr/src/arguments.rs new file mode 100644 index 000000000000..5653993db98f --- /dev/null +++ b/datafusion/expr/src/arguments.rs @@ -0,0 +1,285 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Argument resolution logic for named function parameters + +use crate::Expr; +use datafusion_common::{plan_err, Result}; +use std::collections::HashMap; + +/// Resolves function arguments, handling named and positional notation. +/// +/// This function validates and reorders arguments to match the function's parameter names +/// when named arguments are used. +/// +/// # Rules +/// - All positional arguments must come before named arguments +/// - Named arguments can be in any order after positional arguments +/// - Parameter names follow SQL identifier rules: unquoted names are case-insensitive +/// (normalized to lowercase), quoted names are case-sensitive +/// - No duplicate parameter names allowed +/// +/// # Arguments +/// * `param_names` - The function's parameter names in order +/// * `args` - The argument expressions +/// * `arg_names` - Optional parameter name for each argument +/// +/// # Returns +/// A vector of expressions in the correct order matching the parameter names +/// +/// # Examples +/// ```text +/// Given parameters ["a", "b", "c"] +/// And call: func(10, c => 30, b => 20) +/// Returns: [Expr(10), Expr(20), Expr(30)] +/// ``` +pub fn resolve_function_arguments( + param_names: &[String], + args: Vec, + arg_names: Vec>, +) -> Result> { + if args.len() != arg_names.len() { + return plan_err!( + "Internal error: args length ({}) != arg_names length ({})", + args.len(), + arg_names.len() + ); + } + + // Check if all arguments are positional (fast path) + if arg_names.iter().all(|name| name.is_none()) { + return Ok(args); + } + + validate_argument_order(&arg_names)?; + + reorder_named_arguments(param_names, args, arg_names) +} + +/// Validates that positional arguments come before named arguments +fn validate_argument_order(arg_names: &[Option]) -> Result<()> { + let mut seen_named = false; + for (i, arg_name) in arg_names.iter().enumerate() { + match arg_name { + Some(_) => seen_named = true, + None if seen_named => { + return plan_err!( + "Positional argument at position {} follows named argument. \ + All positional arguments must come before named arguments.", + i + ); + } + None => {} + } + } + Ok(()) +} + +/// Reorders arguments based on named parameters to match signature order +fn reorder_named_arguments( + param_names: &[String], + args: Vec, + arg_names: Vec>, +) -> Result> { + // Build HashMap for O(1) parameter name lookups + let param_index_map: HashMap<&str, usize> = param_names + .iter() + .enumerate() + .map(|(idx, name)| (name.as_str(), idx)) + .collect(); + + let positional_count = arg_names.iter().filter(|n| n.is_none()).count(); + + // Capture args length before consuming the vector + let args_len = args.len(); + + let expected_arg_count = param_names.len(); + + if positional_count > expected_arg_count { + return plan_err!( + "Too many positional arguments: expected at most {}, got {}", + expected_arg_count, + positional_count + ); + } + + let mut result: Vec> = vec![None; expected_arg_count]; + + for (i, (arg, arg_name)) in args.into_iter().zip(arg_names).enumerate() { + if let Some(name) = arg_name { + // Named argument - O(1) lookup in HashMap + let param_index = + param_index_map.get(name.as_str()).copied().ok_or_else(|| { + datafusion_common::plan_datafusion_err!( + "Unknown parameter name '{}'. 
Valid parameters are: [{}]", + name, + param_names.join(", ") + ) + })?; + + if result[param_index].is_some() { + return plan_err!("Parameter '{}' specified multiple times", name); + } + + result[param_index] = Some(arg); + } else { + result[i] = Some(arg); + } + } + + // Only require parameters up to the number of arguments provided (supports optional parameters) + let required_count = args_len; + for i in 0..required_count { + if result[i].is_none() { + return plan_err!("Missing required parameter '{}'", param_names[i]); + } + } + + // Return only the assigned parameters (handles optional trailing parameters) + Ok(result.into_iter().take(required_count).flatten().collect()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::lit; + + #[test] + fn test_all_positional() { + let param_names = vec!["a".to_string(), "b".to_string()]; + + let args = vec![lit(1), lit("hello")]; + let arg_names = vec![None, None]; + + let result = + resolve_function_arguments(&param_names, args.clone(), arg_names).unwrap(); + assert_eq!(result.len(), 2); + } + + #[test] + fn test_all_named() { + let param_names = vec!["a".to_string(), "b".to_string()]; + + let args = vec![lit(1), lit("hello")]; + let arg_names = vec![Some("a".to_string()), Some("b".to_string())]; + + let result = resolve_function_arguments(&param_names, args, arg_names).unwrap(); + assert_eq!(result.len(), 2); + } + + #[test] + fn test_named_reordering() { + let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + + // Call with: func(c => 3.0, a => 1, b => "hello") + let args = vec![lit(3.0), lit(1), lit("hello")]; + let arg_names = vec![ + Some("c".to_string()), + Some("a".to_string()), + Some("b".to_string()), + ]; + + let result = resolve_function_arguments(&param_names, args, arg_names).unwrap(); + + // Should be reordered to [a, b, c] = [1, "hello", 3.0] + assert_eq!(result.len(), 3); + assert_eq!(result[0], lit(1)); + assert_eq!(result[1], lit("hello")); + assert_eq!(result[2], lit(3.0)); + } + + #[test] + fn test_mixed_positional_and_named() { + let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + + // Call with: func(1, c => 3.0, b => "hello") + let args = vec![lit(1), lit(3.0), lit("hello")]; + let arg_names = vec![None, Some("c".to_string()), Some("b".to_string())]; + + let result = resolve_function_arguments(&param_names, args, arg_names).unwrap(); + + // Should be reordered to [a, b, c] = [1, "hello", 3.0] + assert_eq!(result.len(), 3); + assert_eq!(result[0], lit(1)); + assert_eq!(result[1], lit("hello")); + assert_eq!(result[2], lit(3.0)); + } + + #[test] + fn test_positional_after_named_error() { + let param_names = vec!["a".to_string(), "b".to_string()]; + + // Call with: func(a => 1, "hello") - ERROR + let args = vec![lit(1), lit("hello")]; + let arg_names = vec![Some("a".to_string()), None]; + + let result = resolve_function_arguments(&param_names, args, arg_names); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Positional argument")); + } + + #[test] + fn test_unknown_parameter_name() { + let param_names = vec!["a".to_string(), "b".to_string()]; + + // Call with: func(x => 1, b => "hello") - ERROR + let args = vec![lit(1), lit("hello")]; + let arg_names = vec![Some("x".to_string()), Some("b".to_string())]; + + let result = resolve_function_arguments(&param_names, args, arg_names); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Unknown parameter")); + } + + #[test] + fn test_duplicate_parameter_name() { + let 
param_names = vec!["a".to_string(), "b".to_string()]; + + // Call with: func(a => 1, a => 2) - ERROR + let args = vec![lit(1), lit(2)]; + let arg_names = vec![Some("a".to_string()), Some("a".to_string())]; + + let result = resolve_function_arguments(¶m_names, args, arg_names); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("specified multiple times")); + } + + #[test] + fn test_missing_required_parameter() { + let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + + // Call with: func(a => 1, c => 3.0) - missing 'b' + let args = vec![lit(1), lit(3.0)]; + let arg_names = vec![Some("a".to_string()), Some("c".to_string())]; + + let result = resolve_function_arguments(¶m_names, args, arg_names); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Missing required parameter")); + } +} diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 282b3f6a0f55..94dcd2a86150 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -18,7 +18,7 @@ //! Logical Expressions: [`Expr`] use std::cmp::Ordering; -use std::collections::{BTreeMap, HashSet}; +use std::collections::HashSet; use std::fmt::{self, Display, Formatter, Write}; use std::hash::{Hash, Hasher}; use std::mem; @@ -45,6 +45,10 @@ use sqlparser::ast::{ RenameSelectItem, ReplaceSelectElement, }; +// Moved in 51.0.0 to datafusion_common +pub use datafusion_common::metadata::FieldMetadata; +use datafusion_common::metadata::ScalarAndMetadata; + // This mirrors sqlparser::ast::NullTreatment but we need our own variant // for when the sql feature is disabled. #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)] @@ -160,11 +164,11 @@ impl From for NullTreatment { /// # use datafusion_expr::{lit, col, Operator, Expr}; /// // Use the `+` operator to add two columns together /// let expr = col("c1") + col("c2"); -/// assert!(matches!(expr, Expr::BinaryExpr { ..} )); +/// assert!(matches!(expr, Expr::BinaryExpr { .. })); /// if let Expr::BinaryExpr(binary_expr) = expr { -/// assert_eq!(*binary_expr.left, col("c1")); -/// assert_eq!(*binary_expr.right, col("c2")); -/// assert_eq!(binary_expr.op, Operator::Plus); +/// assert_eq!(*binary_expr.left, col("c1")); +/// assert_eq!(*binary_expr.right, col("c2")); +/// assert_eq!(binary_expr.op, Operator::Plus); /// } /// ``` /// @@ -175,12 +179,12 @@ impl From for NullTreatment { /// # use datafusion_common::ScalarValue; /// # use datafusion_expr::{lit, col, Operator, Expr}; /// let expr = col("c1").eq(lit(42_i32)); -/// assert!(matches!(expr, Expr::BinaryExpr { .. } )); +/// assert!(matches!(expr, Expr::BinaryExpr { .. 
})); /// if let Expr::BinaryExpr(binary_expr) = expr { -/// assert_eq!(*binary_expr.left, col("c1")); -/// let scalar = ScalarValue::Int32(Some(42)); -/// assert_eq!(*binary_expr.right, Expr::Literal(scalar, None)); -/// assert_eq!(binary_expr.op, Operator::Eq); +/// assert_eq!(*binary_expr.left, col("c1")); +/// let scalar = ScalarValue::Int32(Some(42)); +/// assert_eq!(*binary_expr.right, Expr::Literal(scalar, None)); +/// assert_eq!(binary_expr.op, Operator::Eq); /// } /// ``` /// @@ -193,22 +197,22 @@ impl From for NullTreatment { /// # use datafusion_expr::Expr; /// // Create a schema c1(int, c2 float) /// let arrow_schema = Schema::new(vec![ -/// Field::new("c1", DataType::Int32, false), -/// Field::new("c2", DataType::Float64, false), +/// Field::new("c1", DataType::Int32, false), +/// Field::new("c2", DataType::Float64, false), /// ]); /// // DFSchema is a an Arrow schema with optional relation name -/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema) -/// .unwrap(); +/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap(); /// /// // Form Vec with an expression for each column in the schema -/// let exprs: Vec<_> = df_schema.iter() -/// .map(Expr::from) -/// .collect(); -/// -/// assert_eq!(exprs, vec![ -/// Expr::from(Column::from_qualified_name("t1.c1")), -/// Expr::from(Column::from_qualified_name("t1.c2")), -/// ]); +/// let exprs: Vec<_> = df_schema.iter().map(Expr::from).collect(); +/// +/// assert_eq!( +/// exprs, +/// vec![ +/// Expr::from(Column::from_qualified_name("t1.c1")), +/// Expr::from(Column::from_qualified_name("t1.c2")), +/// ] +/// ); /// ``` /// /// # Examples: Displaying `Exprs` @@ -269,12 +273,13 @@ impl From for NullTreatment { /// let mut scalars = HashSet::new(); /// // apply recursively visits all nodes in the expression tree /// expr.apply(|e| { -/// if let Expr::Literal(scalar, _) = e { -/// scalars.insert(scalar); -/// } -/// // The return value controls whether to continue visiting the tree -/// Ok(TreeNodeRecursion::Continue) -/// }).unwrap(); +/// if let Expr::Literal(scalar, _) = e { +/// scalars.insert(scalar); +/// } +/// // The return value controls whether to continue visiting the tree +/// Ok(TreeNodeRecursion::Continue) +/// }) +/// .unwrap(); /// // All subtrees have been visited and literals found /// assert_eq!(scalars.len(), 2); /// assert!(scalars.contains(&ScalarValue::Int32(Some(5)))); @@ -421,6 +426,14 @@ impl From for Expr { } } +/// Create an [`Expr`] from an [`ScalarAndMetadata`] +impl From for Expr { + fn from(value: ScalarAndMetadata) -> Self { + let (value, metadata) = value.into_inner(); + Expr::Literal(value, metadata) + } +} + /// Create an [`Expr`] from an optional qualifier and a [`FieldRef`]. This is /// useful for creating [`Expr`] from a [`DFSchema`]. /// @@ -447,235 +460,6 @@ impl<'a> TreeNodeContainer<'a, Self> for Expr { } } -/// Literal metadata -/// -/// Stores metadata associated with a literal expressions -/// and is designed to be fast to `clone`. -/// -/// This structure is used to store metadata associated with a literal expression, and it -/// corresponds to the `metadata` field on [`Field`]. 
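The `FieldMetadata` block removed below now lives in `datafusion_common::metadata`, and the `pub use datafusion_common::metadata::FieldMetadata` re-export added earlier in this file keeps the old path compiling. A minimal sketch of constructing it from the new location, assuming the `From<BTreeMap<String, String>>` impl moves along with the type:

use std::collections::BTreeMap;

// New canonical location; `datafusion_expr::expr::FieldMetadata` is assumed to
// keep resolving to the same type via the re-export added in this diff.
use datafusion_common::metadata::FieldMetadata;

fn make_metadata() -> FieldMetadata {
    FieldMetadata::from(BTreeMap::from([(
        "source".to_string(),
        "example".to_string(),
    )]))
}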
-/// -/// # Example: Create [`FieldMetadata`] from a [`Field`] -/// ``` -/// # use std::collections::HashMap; -/// # use datafusion_expr::expr::FieldMetadata; -/// # use arrow::datatypes::{Field, DataType}; -/// # let field = Field::new("c1", DataType::Int32, true) -/// # .with_metadata(HashMap::from([("foo".to_string(), "bar".to_string())])); -/// // Create a new `FieldMetadata` instance from a `Field` -/// let metadata = FieldMetadata::new_from_field(&field); -/// // There is also a `From` impl: -/// let metadata = FieldMetadata::from(&field); -/// ``` -/// -/// # Example: Update a [`Field`] with [`FieldMetadata`] -/// ``` -/// # use datafusion_expr::expr::FieldMetadata; -/// # use arrow::datatypes::{Field, DataType}; -/// # let field = Field::new("c1", DataType::Int32, true); -/// # let metadata = FieldMetadata::new_from_field(&field); -/// // Add any metadata from `FieldMetadata` to `Field` -/// let updated_field = metadata.add_to_field(field); -/// ``` -/// -#[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] -pub struct FieldMetadata { - /// The inner metadata of a literal expression, which is a map of string - /// keys to string values. - /// - /// Note this is not a `HashMap` because `HashMap` does not provide - /// implementations for traits like `Debug` and `Hash`. - inner: Arc>, -} - -impl Default for FieldMetadata { - fn default() -> Self { - Self::new_empty() - } -} - -impl FieldMetadata { - /// Create a new empty metadata instance. - pub fn new_empty() -> Self { - Self { - inner: Arc::new(BTreeMap::new()), - } - } - - /// Merges two optional `FieldMetadata` instances, overwriting any existing - /// keys in `m` with keys from `n` if present. - /// - /// This function is commonly used in alias operations, particularly for literals - /// with metadata. When creating an alias expression, the metadata from the original - /// expression (such as a literal) is combined with any metadata specified on the alias. 
- /// - /// # Arguments - /// - /// * `m` - The first metadata (typically from the original expression like a literal) - /// * `n` - The second metadata (typically from the alias definition) - /// - /// # Merge Strategy - /// - /// - If both metadata instances exist, they are merged with `n` taking precedence - /// - Keys from `n` will overwrite keys from `m` if they have the same name - /// - If only one metadata instance exists, it is returned unchanged - /// - If neither exists, `None` is returned - /// - /// # Example usage - /// ```rust - /// use datafusion_expr::expr::FieldMetadata; - /// use std::collections::BTreeMap; - /// - /// // Create metadata for a literal expression - /// let literal_metadata = Some(FieldMetadata::from(BTreeMap::from([ - /// ("source".to_string(), "constant".to_string()), - /// ("type".to_string(), "int".to_string()), - /// ]))); - /// - /// // Create metadata for an alias - /// let alias_metadata = Some(FieldMetadata::from(BTreeMap::from([ - /// ("description".to_string(), "answer".to_string()), - /// ("source".to_string(), "user".to_string()), // This will override literal's "source" - /// ]))); - /// - /// // Merge the metadata - /// let merged = FieldMetadata::merge_options( - /// literal_metadata.as_ref(), - /// alias_metadata.as_ref(), - /// ); - /// - /// // Result contains: {"source": "user", "type": "int", "description": "answer"} - /// assert!(merged.is_some()); - /// ``` - pub fn merge_options( - m: Option<&FieldMetadata>, - n: Option<&FieldMetadata>, - ) -> Option { - match (m, n) { - (Some(m), Some(n)) => { - let mut merged = m.clone(); - merged.extend(n.clone()); - Some(merged) - } - (Some(m), None) => Some(m.clone()), - (None, Some(n)) => Some(n.clone()), - (None, None) => None, - } - } - - /// Create a new metadata instance from a `Field`'s metadata. - pub fn new_from_field(field: &Field) -> Self { - let inner = field - .metadata() - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(); - Self { - inner: Arc::new(inner), - } - } - - /// Create a new metadata instance from a map of string keys to string values. - pub fn new(inner: BTreeMap) -> Self { - Self { - inner: Arc::new(inner), - } - } - - /// Get the inner metadata as a reference to a `BTreeMap`. - pub fn inner(&self) -> &BTreeMap { - &self.inner - } - - /// Return the inner metadata - pub fn into_inner(self) -> Arc> { - self.inner - } - - /// Adds metadata from `other` into `self`, overwriting any existing keys. - pub fn extend(&mut self, other: Self) { - if other.is_empty() { - return; - } - let other = Arc::unwrap_or_clone(other.into_inner()); - Arc::make_mut(&mut self.inner).extend(other); - } - - /// Returns true if the metadata is empty. - pub fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - /// Returns the number of key-value pairs in the metadata. - pub fn len(&self) -> usize { - self.inner.len() - } - - /// Convert this `FieldMetadata` into a `HashMap` - pub fn to_hashmap(&self) -> std::collections::HashMap { - self.inner - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect() - } - - /// Updates the metadata on the Field with this metadata, if it is not empty. 
- pub fn add_to_field(&self, field: Field) -> Field { - if self.inner.is_empty() { - return field; - } - - field.with_metadata(self.to_hashmap()) - } -} - -impl From<&Field> for FieldMetadata { - fn from(field: &Field) -> Self { - Self::new_from_field(field) - } -} - -impl From> for FieldMetadata { - fn from(inner: BTreeMap) -> Self { - Self::new(inner) - } -} - -impl From> for FieldMetadata { - fn from(map: std::collections::HashMap) -> Self { - Self::new(map.into_iter().collect()) - } -} - -/// From reference -impl From<&std::collections::HashMap> for FieldMetadata { - fn from(map: &std::collections::HashMap) -> Self { - let inner = map - .iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(); - Self::new(inner) - } -} - -/// From hashbrown map -impl From> for FieldMetadata { - fn from(map: HashMap) -> Self { - let inner = map.into_iter().collect(); - Self::new(inner) - } -} - -impl From<&HashMap> for FieldMetadata { - fn from(map: &HashMap) -> Self { - let inner = map - .into_iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(); - Self::new(inner) - } -} - /// The metadata used in [`Field::metadata`]. /// /// This represents the metadata associated with an Arrow [`Field`]. The metadata consists of key-value pairs. @@ -1370,13 +1154,22 @@ pub struct Placeholder { /// The identifier of the parameter, including the leading `$` (e.g, `"$1"` or `"$foo"`) pub id: String, /// The type the parameter will be filled in with - pub data_type: Option, + pub field: Option, } impl Placeholder { /// Create a new Placeholder expression + #[deprecated(since = "51.0.0", note = "Use new_with_field instead")] pub fn new(id: String, data_type: Option) -> Self { - Self { id, data_type } + Self { + id, + field: data_type.map(|dt| Arc::new(Field::new("", dt, true))), + } + } + + /// Create a new Placeholder expression from a Field + pub fn new_with_field(id: String, field: Option) -> Self { + Self { id, field } } } @@ -1843,12 +1636,11 @@ impl Expr { /// ``` /// # use datafusion_expr::col; /// # use std::collections::HashMap; - /// # use datafusion_expr::expr::FieldMetadata; + /// # use datafusion_common::metadata::FieldMetadata; /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]); /// let metadata = FieldMetadata::from(metadata); /// let expr = col("foo").alias_with_metadata("bar", Some(metadata)); /// ``` - /// pub fn alias_with_metadata( self, name: impl Into, @@ -1875,12 +1667,12 @@ impl Expr { /// ``` /// # use datafusion_expr::col; /// # use std::collections::HashMap; - /// # use datafusion_expr::expr::FieldMetadata; + /// # use datafusion_common::metadata::FieldMetadata; /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]); /// let metadata = FieldMetadata::from(metadata); - /// let expr = col("foo").alias_qualified_with_metadata(Some("tbl"), "bar", Some(metadata)); + /// let expr = + /// col("foo").alias_qualified_with_metadata(Some("tbl"), "bar", Some(metadata)); /// ``` - /// pub fn alias_qualified_with_metadata( self, relation: Option>, @@ -2886,19 +2678,23 @@ impl HashNode for Expr { } } -// Modifies expr if it is a placeholder with datatype of right +// Modifies expr to match the DataType, metadata, and nullability of other if it is +// a placeholder with previously unspecified type information (i.e., most placeholders) fn rewrite_placeholder(expr: &mut Expr, other: &Expr, schema: &DFSchema) -> Result<()> { - if let Expr::Placeholder(Placeholder { id: _, data_type }) = expr { - if data_type.is_none() { - let other_dt = 
other.get_type(schema); - match other_dt { + if let Expr::Placeholder(Placeholder { id: _, field }) = expr { + if field.is_none() { + let other_field = other.to_field(schema); + match other_field { Err(e) => { Err(e.context(format!( "Can not find type of {other} needed to infer type of {expr}" )))?; } - Ok(dt) => { - *data_type = Some(dt); + Ok((_, other_field)) => { + // We can't infer the nullability of the future parameter that might + // be bound, so ensure this is set to true + *field = + Some(other_field.as_ref().clone().with_nullable(true).into()); } } }; @@ -3715,8 +3511,8 @@ pub fn physical_name(expr: &Expr) -> Result { mod test { use crate::expr_fn::col; use crate::{ - case, lit, qualified_wildcard, wildcard, wildcard_with_options, ColumnarValue, - ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Volatility, + case, lit, placeholder, qualified_wildcard, wildcard, wildcard_with_options, + ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Volatility, }; use arrow::datatypes::{Field, Schema}; use sqlparser::ast; @@ -3730,15 +3526,15 @@ mod test { let param_placeholders = vec![ Expr::Placeholder(Placeholder { id: "$1".to_string(), - data_type: None, + field: None, }), Expr::Placeholder(Placeholder { id: "$2".to_string(), - data_type: None, + field: None, }), Expr::Placeholder(Placeholder { id: "$3".to_string(), - data_type: None, + field: None, }), ]; let in_list = Expr::InList(InList { @@ -3764,8 +3560,8 @@ mod test { match expr { Expr::Placeholder(placeholder) => { assert_eq!( - placeholder.data_type, - Some(DataType::Int32), + placeholder.field.unwrap().data_type(), + &DataType::Int32, "Placeholder {} should infer Int32", placeholder.id ); @@ -3789,7 +3585,7 @@ mod test { expr: Box::new(col("name")), pattern: Box::new(Expr::Placeholder(Placeholder { id: "$1".to_string(), - data_type: None, + field: None, })), negated: false, case_insensitive: false, @@ -3802,7 +3598,7 @@ mod test { match inferred_expr { Expr::Like(like) => match *like.pattern { Expr::Placeholder(placeholder) => { - assert_eq!(placeholder.data_type, Some(DataType::Utf8)); + assert_eq!(placeholder.field.unwrap().data_type(), &DataType::Utf8); } _ => panic!("Expected Placeholder"), }, @@ -3817,8 +3613,8 @@ mod test { Expr::SimilarTo(like) => match *like.pattern { Expr::Placeholder(placeholder) => { assert_eq!( - placeholder.data_type, - Some(DataType::Utf8), + placeholder.field.unwrap().data_type(), + &DataType::Utf8, "Placeholder {} should infer Utf8", placeholder.id ); @@ -3829,6 +3625,39 @@ mod test { } } + #[test] + fn infer_placeholder_with_metadata() { + // name == $1, where name is a non-nullable string + let schema = + Arc::new(Schema::new(vec![Field::new("name", DataType::Utf8, false) + .with_metadata( + [("some_key".to_string(), "some_value".to_string())].into(), + )])); + let df_schema = DFSchema::try_from(schema).unwrap(); + + let expr = binary_expr(col("name"), Operator::Eq, placeholder("$1")); + + let (inferred_expr, _) = expr.infer_placeholder_types(&df_schema).unwrap(); + match inferred_expr { + Expr::BinaryExpr(BinaryExpr { right, .. 
}) => match *right { + Expr::Placeholder(placeholder) => { + assert_eq!( + placeholder.field.as_ref().unwrap().data_type(), + &DataType::Utf8 + ); + assert_eq!( + placeholder.field.as_ref().unwrap().metadata(), + df_schema.field(0).metadata() + ); + // Inferred placeholder should still be nullable + assert!(placeholder.field.as_ref().unwrap().is_nullable()); + } + _ => panic!("Expected Placeholder"), + }, + _ => panic!("Expected BinaryExpr"), + } + } + #[test] fn format_case_when() -> Result<()> { let expr = case(col("a")) diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 4666411dd540..c777c4978f99 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -119,13 +119,13 @@ pub fn ident(name: impl Into) -> Expr { /// /// ```rust /// # use datafusion_expr::{placeholder}; -/// let p = placeholder("$0"); // $0, refers to parameter 1 -/// assert_eq!(p.to_string(), "$0") +/// let p = placeholder("$1"); // $1, refers to parameter 1 +/// assert_eq!(p.to_string(), "$1") /// ``` pub fn placeholder(id: impl Into) -> Expr { Expr::Placeholder(Placeholder { id: id.into(), - data_type: None, + field: None, }) } diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index e803e3534130..9e8d6080b82c 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -17,8 +17,8 @@ use super::{Between, Expr, Like}; use crate::expr::{ - AggregateFunction, AggregateFunctionParams, Alias, BinaryExpr, Cast, FieldMetadata, - InList, InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction, + AggregateFunction, AggregateFunctionParams, Alias, BinaryExpr, Cast, InList, + InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction, WindowFunctionParams, }; use crate::type_coercion::functions::{ @@ -28,6 +28,7 @@ use crate::udf::ReturnFieldArgs; use crate::{utils, LogicalPlan, Projection, Subquery, WindowFunctionDefinition}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion_common::metadata::FieldMetadata; use datafusion_common::{ not_impl_err, plan_datafusion_err, plan_err, Column, DataFusionError, ExprSchema, Result, Spans, TableReference, @@ -81,15 +82,17 @@ impl ExprSchemable for Expr { /// # use std::collections::HashMap; /// /// fn main() { - /// let expr = col("c1") + col("c2"); - /// let schema = DFSchema::from_unqualified_fields( - /// vec![ - /// Field::new("c1", DataType::Int32, true), - /// Field::new("c2", DataType::Float32, true), - /// ].into(), - /// HashMap::new(), - /// ).unwrap(); - /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); + /// let expr = col("c1") + col("c2"); + /// let schema = DFSchema::from_unqualified_fields( + /// vec![ + /// Field::new("c1", DataType::Int32, true), + /// Field::new("c2", DataType::Float32, true), + /// ] + /// .into(), + /// HashMap::new(), + /// ) + /// .unwrap(); + /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); /// } /// ``` /// @@ -104,9 +107,9 @@ impl ExprSchemable for Expr { fn get_type(&self, schema: &dyn ExprSchema) -> Result { match self { Expr::Alias(Alias { expr, name, .. }) => match &**expr { - Expr::Placeholder(Placeholder { data_type, .. }) => match &data_type { + Expr::Placeholder(Placeholder { field, .. 
}) => match &field { None => schema.data_type(&Column::from_name(name)).cloned(), - Some(dt) => Ok(dt.clone()), + Some(field) => Ok(field.data_type().clone()), }, _ => expr.get_type(schema), }, @@ -211,9 +214,9 @@ impl ExprSchemable for Expr { ) .get_result_type(), Expr::Like { .. } | Expr::SimilarTo { .. } => Ok(DataType::Boolean), - Expr::Placeholder(Placeholder { data_type, .. }) => { - if let Some(dtype) = data_type { - Ok(dtype.clone()) + Expr::Placeholder(Placeholder { field, .. }) => { + if let Some(field) = field { + Ok(field.data_type().clone()) } else { // If the placeholder's type hasn't been specified, treat it as // null (unspecified placeholders generate an error during planning) @@ -309,10 +312,12 @@ impl ExprSchemable for Expr { window_function, ) .map(|(_, nullable)| nullable), - Expr::ScalarVariable(_, _) - | Expr::TryCast { .. } - | Expr::Unnest(_) - | Expr::Placeholder(_) => Ok(true), + Expr::Placeholder(Placeholder { id: _, field }) => { + Ok(field.as_ref().map(|f| f.is_nullable()).unwrap_or(true)) + } + Expr::ScalarVariable(_, _) | Expr::TryCast { .. } | Expr::Unnest(_) => { + Ok(true) + } Expr::IsNull(_) | Expr::IsNotNull(_) | Expr::IsTrue(_) @@ -428,25 +433,11 @@ impl ExprSchemable for Expr { let field = match self { Expr::Alias(Alias { expr, - name, + name: _, metadata, .. }) => { - let field = match &**expr { - Expr::Placeholder(Placeholder { data_type, .. }) => { - match &data_type { - None => schema - .data_type_and_nullable(&Column::from_name(name)) - .map(|(d, n)| Field::new(&schema_name, d.clone(), n)), - Some(dt) => Ok(Field::new( - &schema_name, - dt.clone(), - expr.nullable(schema)?, - )), - } - } - _ => expr.to_field(schema).map(|(_, f)| f.as_ref().clone()), - }?; + let field = expr.to_field(schema).map(|(_, f)| f.as_ref().clone())?; let mut combined_metadata = expr.metadata(schema)?; if let Some(metadata) = metadata { @@ -594,6 +585,10 @@ impl ExprSchemable for Expr { .to_field(schema) .map(|(_, f)| f.as_ref().clone().with_data_type(data_type.clone())) .map(Arc::new), + Expr::Placeholder(Placeholder { + id: _, + field: Some(field), + }) => Ok(field.as_ref().clone().with_name(&schema_name).into()), Expr::Like(_) | Expr::SimilarTo(_) | Expr::Not(_) @@ -741,7 +736,6 @@ impl Expr { /// new projection with the casted expression. /// 2. **Non-projection plan**: If the subquery isn't a projection, it adds a projection to the plan /// with the casted first column. 
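A hedged usage sketch of the two cases described above, assuming `cast_subquery` and `Subquery` are importable from the crate root and that the subquery's first output column is the one being cast:

use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_expr::{cast_subquery, Subquery};

// If the first output column already has the target type the subquery is
// returned unchanged; otherwise a casting projection is layered on top.
fn ensure_int64(subquery: Subquery) -> Result<Subquery> {
    let casted = cast_subquery(subquery, &DataType::Int64)?;
    assert_eq!(
        casted.subquery.schema().field(0).data_type(),
        &DataType::Int64
    );
    Ok(casted)
}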
-/// pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result { if subquery.subquery.schema().field(0).data_type() == cast_to_type { return Ok(subquery); @@ -776,10 +770,12 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result {{ @@ -905,7 +901,7 @@ mod tests { let schema = DFSchema::from_unqualified_fields( vec![meta.add_to_field(Field::new("foo", DataType::Int32, true))].into(), - std::collections::HashMap::new(), + HashMap::new(), ) .unwrap(); @@ -921,6 +917,52 @@ mod tests { assert_eq!(meta, outer_ref.metadata(&schema).unwrap()); } + #[test] + fn test_expr_placeholder() { + let schema = MockExprSchema::new(); + + let mut placeholder_meta = HashMap::new(); + placeholder_meta.insert("bar".to_string(), "buzz".to_string()); + let placeholder_meta = FieldMetadata::from(placeholder_meta); + + let expr = Expr::Placeholder(Placeholder::new_with_field( + "".to_string(), + Some( + Field::new("", DataType::Utf8, true) + .with_metadata(placeholder_meta.to_hashmap()) + .into(), + ), + )); + + assert_eq!( + expr.data_type_and_nullable(&schema).unwrap(), + (DataType::Utf8, true) + ); + assert_eq!(placeholder_meta, expr.metadata(&schema).unwrap()); + + let expr_alias = expr.alias("a placeholder by any other name"); + assert_eq!( + expr_alias.data_type_and_nullable(&schema).unwrap(), + (DataType::Utf8, true) + ); + assert_eq!(placeholder_meta, expr_alias.metadata(&schema).unwrap()); + + // Non-nullable placeholder field should remain non-nullable + let expr = Expr::Placeholder(Placeholder::new_with_field( + "".to_string(), + Some(Field::new("", DataType::Utf8, false).into()), + )); + assert_eq!( + expr.data_type_and_nullable(&schema).unwrap(), + (DataType::Utf8, false) + ); + let expr_alias = expr.alias("a placeholder by any other name"); + assert_eq!( + expr_alias.data_type_and_nullable(&schema).unwrap(), + (DataType::Utf8, false) + ); + } + #[derive(Debug)] struct MockExprSchema { field: Field, diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 346d373ff5b4..2b7cc9d46ad3 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -44,6 +44,7 @@ mod udaf; mod udf; mod udwf; +pub mod arguments; pub mod conditional_expressions; pub mod execution_props; pub mod expr; diff --git a/datafusion/expr/src/literal.rs b/datafusion/expr/src/literal.rs index c4bd43bc0a62..335d7b471f5f 100644 --- a/datafusion/expr/src/literal.rs +++ b/datafusion/expr/src/literal.rs @@ -17,9 +17,8 @@ //! Literal module contains foundational types that are used to represent literals in DataFusion. 
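With `pub mod arguments;` exported from lib.rs above, the resolver should be reachable as `datafusion_expr::arguments::resolve_function_arguments`. A minimal sketch of the mixed positional/named case from its documentation (the call path is assumed from this diff, not verified against a published release):

use datafusion_expr::arguments::resolve_function_arguments;
use datafusion_expr::lit;

fn named_argument_example() {
    // func(10, c => 30, b => 20) against declared parameters ["a", "b", "c"]
    let params = vec!["a".to_string(), "b".to_string(), "c".to_string()];
    let args = vec![lit(10), lit(30), lit(20)];
    let names = vec![None, Some("c".to_string()), Some("b".to_string())];

    // The positional `10` binds to `a`; the named arguments are reordered to
    // match the declared parameter order.
    let resolved = resolve_function_arguments(&params, args, names).unwrap();
    assert_eq!(resolved, vec![lit(10), lit(20), lit(30)]);
}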
-use crate::expr::FieldMetadata; use crate::Expr; -use datafusion_common::ScalarValue; +use datafusion_common::{metadata::FieldMetadata, ScalarValue}; /// Create a literal expression pub fn lit(n: T) -> Expr { diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 7a283b0420d3..b9afd894d77d 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -25,7 +25,7 @@ use std::iter::once; use std::sync::Arc; use crate::dml::CopyTo; -use crate::expr::{Alias, FieldMetadata, PlannedReplaceSelectItem, Sort as SortExpr}; +use crate::expr::{Alias, PlannedReplaceSelectItem, Sort as SortExpr}; use crate::expr_rewriter::{ coerce_plan_expr_for_schema, normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols, normalize_sorts, @@ -50,9 +50,10 @@ use crate::{ use super::dml::InsertOp; use arrow::compute::can_cast_types; -use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::file_options::file_type::FileType; +use datafusion_common::metadata::FieldMetadata; use datafusion_common::{ exec_err, get_target_functional_dependencies, internal_datafusion_err, not_impl_err, plan_datafusion_err, plan_err, Column, Constraints, DFSchema, DFSchemaRef, @@ -449,14 +450,13 @@ impl LogicalPlanBuilder { /// # ])) as _; /// # let table_source = Arc::new(LogicalTableSource::new(employee_schema)); /// // VALUES (1), (2) - /// let input = LogicalPlanBuilder::values(vec![vec![lit(1)], vec![lit(2)]])? - /// .build()?; + /// let input = LogicalPlanBuilder::values(vec![vec![lit(1)], vec![lit(2)]])?.build()?; /// // INSERT INTO MyTable VALUES (1), (2) /// let insert_plan = LogicalPlanBuilder::insert_into( - /// input, - /// "MyTable", - /// table_source, - /// InsertOp::Append, + /// input, + /// "MyTable", + /// table_source, + /// InsertOp::Append, /// )?; /// # Ok(()) /// # } @@ -622,11 +622,11 @@ impl LogicalPlanBuilder { } /// Make a builder for a prepare logical plan from the builder's plan - pub fn prepare(self, name: String, data_types: Vec) -> Result { + pub fn prepare(self, name: String, fields: Vec) -> Result { Ok(Self::new(LogicalPlan::Statement(Statement::Prepare( Prepare { name, - data_types, + fields, input: self.plan, }, )))) @@ -952,8 +952,8 @@ impl LogicalPlanBuilder { /// // Form the expression `(left.a != right.a)` AND `(left.b != right.b)` /// let exprs = vec![ /// col("left.a").eq(col("right.a")), - /// col("left.b").not_eq(col("right.b")) - /// ]; + /// col("left.b").not_eq(col("right.b")), + /// ]; /// /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)` /// // finding all pairs of rows from `left` and `right` where diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs index ea08c223e8f4..b60126335598 100644 --- a/datafusion/expr/src/logical_plan/display.rs +++ b/datafusion/expr/src/logical_plan/display.rs @@ -94,17 +94,17 @@ impl<'n> TreeNodeVisitor<'n> for IndentVisitor<'_, '_> { /// `foo:Utf8;N` if `foo` is nullable. 
/// /// ``` -/// use arrow::datatypes::{Field, Schema, DataType}; +/// use arrow::datatypes::{DataType, Field, Schema}; /// # use datafusion_expr::logical_plan::display_schema; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), /// Field::new("first_name", DataType::Utf8, true), -/// ]); +/// ]); /// -/// assert_eq!( -/// "[id:Int32, first_name:Utf8;N]", -/// format!("{}", display_schema(&schema)) -/// ); +/// assert_eq!( +/// "[id:Int32, first_name:Utf8;N]", +/// format!("{}", display_schema(&schema)) +/// ); /// ``` pub fn display_schema(schema: &Schema) -> impl fmt::Display + '_ { struct Wrapper<'a>(&'a Schema); diff --git a/datafusion/expr/src/logical_plan/extension.rs b/datafusion/expr/src/logical_plan/extension.rs index a8ee7885644a..fe324d40fd95 100644 --- a/datafusion/expr/src/logical_plan/extension.rs +++ b/datafusion/expr/src/logical_plan/extension.rs @@ -39,10 +39,10 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync { /// # struct Dummy { } /// /// # impl Dummy { - /// // canonical boiler plate - /// fn as_any(&self) -> &dyn Any { - /// self - /// } + /// // canonical boiler plate + /// fn as_any(&self) -> &dyn Any { + /// self + /// } /// # } /// ``` fn as_any(&self) -> &dyn Any; @@ -131,18 +131,18 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync { /// // User defined node that derives Hash /// #[derive(Hash, Debug, PartialEq, Eq)] /// struct MyNode { - /// val: u64 + /// val: u64, /// } /// /// // impl UserDefinedLogicalNode { /// // ... /// # impl MyNode { - /// // Boiler plate to call the derived Hash impl - /// fn dyn_hash(&self, state: &mut dyn std::hash::Hasher) { + /// // Boiler plate to call the derived Hash impl + /// fn dyn_hash(&self, state: &mut dyn std::hash::Hasher) { /// use std::hash::Hash; /// let mut s = state; /// self.hash(&mut s); - /// } + /// } /// // } /// # } /// ``` @@ -169,19 +169,19 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync { /// // User defined node that derives Eq /// #[derive(Hash, Debug, PartialEq, Eq)] /// struct MyNode { - /// val: u64 + /// val: u64, /// } /// /// // impl UserDefinedLogicalNode { /// // ... 
/// # impl MyNode { - /// // Boiler plate to call the derived Eq impl - /// fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool { + /// // Boiler plate to call the derived Eq impl + /// fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool { /// match other.as_any().downcast_ref::() { - /// Some(o) => self == o, - /// None => false, + /// Some(o) => self == o, + /// None => false, /// } - /// } + /// } /// // } /// # } /// ``` diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 05a2564464c5..0f0d81186d68 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -51,9 +51,10 @@ use crate::{ WindowFunctionDefinition, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef}; use datafusion_common::cse::{NormalizeEq, Normalizeable}; use datafusion_common::format::ExplainFormat; +use datafusion_common::metadata::check_metadata_with_storage_equal; use datafusion_common::tree_node::{ Transformed, TreeNode, TreeNodeContainer, TreeNodeRecursion, }; @@ -202,7 +203,6 @@ pub use datafusion_common::{JoinConstraint, JoinType}; /// # Ok(()) /// # } /// ``` -/// #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub enum LogicalPlan { /// Evaluates an arbitrary list of expressions (essentially a @@ -1098,15 +1098,13 @@ impl LogicalPlan { })) } LogicalPlan::Statement(Statement::Prepare(Prepare { - name, - data_types, - .. + name, fields, .. })) => { self.assert_no_expressions(expr)?; let input = self.only_input(inputs)?; Ok(LogicalPlan::Statement(Statement::Prepare(Prepare { name: name.clone(), - data_types: data_types.clone(), + fields: fields.clone(), input: Arc::new(input), }))) } @@ -1268,7 +1266,6 @@ impl LogicalPlan { /// \n TableScan: t1", /// plan.display_indent().to_string() /// ); - /// /// ``` pub fn with_param_values( self, @@ -1282,7 +1279,7 @@ impl LogicalPlan { if let LogicalPlan::Statement(Statement::Prepare(prepare_lp)) = plan_with_values { - param_values.verify(&prepare_lp.data_types)?; + param_values.verify_fields(&prepare_lp.fields)?; // try and take ownership of the input if is not shared, clone otherwise Arc::unwrap_or_clone(prepare_lp.input) } else { @@ -1463,8 +1460,10 @@ impl LogicalPlan { let original_name = name_preserver.save(&e); let transformed_expr = e.transform_up(|e| { if let Expr::Placeholder(Placeholder { id, .. }) = e { - let value = param_values.get_placeholders_with_values(&id)?; - Ok(Transformed::yes(Expr::Literal(value, None))) + let (value, metadata) = param_values + .get_placeholders_with_values(&id)? + .into_inner(); + Ok(Transformed::yes(Expr::Literal(value, metadata))) } else { Ok(Transformed::no(e)) } @@ -1494,24 +1493,43 @@ impl LogicalPlan { } /// Walk the logical plan, find any `Placeholder` tokens, and return a map of their IDs and DataTypes + /// + /// Note that this will drop any extension or field metadata attached to parameters. Use + /// [`LogicalPlan::get_parameter_fields`] to keep extension metadata. 
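A hedged sketch of the distinction noted above, assuming `plan` is a `LogicalPlan` whose `$1` placeholder already carries an inferred field:

use std::collections::HashMap;

use arrow::datatypes::{DataType, FieldRef};
use datafusion_common::Result;
use datafusion_expr::LogicalPlan;

// `get_parameter_types` keeps only the DataType per placeholder id, while
// `get_parameter_fields` preserves the whole FieldRef, including field metadata.
fn inspect_parameters(plan: &LogicalPlan) -> Result<()> {
    let types: HashMap<String, Option<DataType>> = plan.get_parameter_types()?;
    let fields: HashMap<String, Option<FieldRef>> = plan.get_parameter_fields()?;
    assert_eq!(types.len(), fields.len());
    if let Some(Some(field)) = fields.get("$1") {
        // Only the FieldRef version retains the field's metadata.
        let _metadata = field.metadata();
    }
    Ok(())
}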
pub fn get_parameter_types( &self, ) -> Result>, DataFusionError> { - let mut param_types: HashMap> = HashMap::new(); + let mut parameter_fields = self.get_parameter_fields()?; + Ok(parameter_fields + .drain() + .map(|(name, maybe_field)| { + (name, maybe_field.map(|field| field.data_type().clone())) + }) + .collect()) + } + + /// Walk the logical plan, find any `Placeholder` tokens, and return a map of their IDs and FieldRefs + pub fn get_parameter_fields( + &self, + ) -> Result>, DataFusionError> { + let mut param_types: HashMap> = HashMap::new(); self.apply_with_subqueries(|plan| { plan.apply_expressions(|expr| { expr.apply(|expr| { - if let Expr::Placeholder(Placeholder { id, data_type }) = expr { + if let Expr::Placeholder(Placeholder { id, field }) = expr { let prev = param_types.get(id); - match (prev, data_type) { - (Some(Some(prev)), Some(dt)) => { - if prev != dt { - plan_err!("Conflicting types for {id}")?; - } + match (prev, field) { + (Some(Some(prev)), Some(field)) => { + check_metadata_with_storage_equal( + (field.data_type(), Some(field.metadata())), + (prev.data_type(), Some(prev.metadata())), + "parameter", + &format!(": Conflicting types for id {id}"), + )?; } - (_, Some(dt)) => { - param_types.insert(id.clone(), Some(dt.clone())); + (_, Some(field)) => { + param_types.insert(id.clone(), Some(Arc::clone(field))); } _ => { param_types.insert(id.clone(), None); @@ -1541,20 +1559,20 @@ impl LogicalPlan { /// ``` /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; - /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; - /// let schema = Schema::new(vec![ - /// Field::new("id", DataType::Int32, false), - /// ]); - /// let plan = table_scan(Some("t1"), &schema, None).unwrap() - /// .filter(col("id").eq(lit(5))).unwrap() - /// .build().unwrap(); + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder}; + /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + /// let plan = table_scan(Some("t1"), &schema, None) + /// .unwrap() + /// .filter(col("id").eq(lit(5))) + /// .unwrap() + /// .build() + /// .unwrap(); /// /// // Format using display_indent /// let display_string = format!("{}", plan.display_indent()); /// - /// assert_eq!("Filter: t1.id = Int32(5)\n TableScan: t1", - /// display_string); + /// assert_eq!("Filter: t1.id = Int32(5)\n TableScan: t1", display_string); /// ``` pub fn display_indent(&self) -> impl Display + '_ { // Boilerplate structure to wrap LogicalPlan with something @@ -1583,21 +1601,24 @@ impl LogicalPlan { /// ``` /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; - /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; - /// let schema = Schema::new(vec![ - /// Field::new("id", DataType::Int32, false), - /// ]); - /// let plan = table_scan(Some("t1"), &schema, None).unwrap() - /// .filter(col("id").eq(lit(5))).unwrap() - /// .build().unwrap(); + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder}; + /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + /// let plan = table_scan(Some("t1"), &schema, None) + /// .unwrap() + /// .filter(col("id").eq(lit(5))) + /// .unwrap() + /// .build() + /// .unwrap(); /// /// // Format using display_indent_schema /// let display_string = format!("{}", plan.display_indent_schema()); /// - /// assert_eq!("Filter: t1.id 
= Int32(5) [id:Int32]\ + /// assert_eq!( + /// "Filter: t1.id = Int32(5) [id:Int32]\ /// \n TableScan: t1 [id:Int32]", - /// display_string); + /// display_string + /// ); /// ``` pub fn display_indent_schema(&self) -> impl Display + '_ { // Boilerplate structure to wrap LogicalPlan with something @@ -1645,14 +1666,15 @@ impl LogicalPlan { /// structure, and one with additional details such as schema. /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; - /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; - /// let schema = Schema::new(vec![ - /// Field::new("id", DataType::Int32, false), - /// ]); - /// let plan = table_scan(Some("t1"), &schema, None).unwrap() - /// .filter(col("id").eq(lit(5))).unwrap() - /// .build().unwrap(); + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder}; + /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + /// let plan = table_scan(Some("t1"), &schema, None) + /// .unwrap() + /// .filter(col("id").eq(lit(5))) + /// .unwrap() + /// .build() + /// .unwrap(); /// /// // Format using display_graphviz /// let graphviz_string = format!("{}", plan.display_graphviz()); @@ -1664,7 +1686,6 @@ impl LogicalPlan { /// ```bash /// dot -Tpdf < /tmp/example.dot > /tmp/example.pdf /// ``` - /// pub fn display_graphviz(&self) -> impl Display + '_ { // Boilerplate structure to wrap LogicalPlan with something // that that can be formatted @@ -1703,13 +1724,13 @@ impl LogicalPlan { /// Projection: id /// ``` /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; - /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; - /// let schema = Schema::new(vec![ - /// Field::new("id", DataType::Int32, false), - /// ]); - /// let plan = table_scan(Some("t1"), &schema, None).unwrap() - /// .build().unwrap(); + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder}; + /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + /// let plan = table_scan(Some("t1"), &schema, None) + /// .unwrap() + /// .build() + /// .unwrap(); /// /// // Format using display /// let display_string = format!("{}", plan.display()); @@ -4231,6 +4252,7 @@ mod tests { binary_expr, col, exists, in_subquery, lit, placeholder, scalar_subquery, GroupingSet, }; + use datafusion_common::metadata::ScalarAndMetadata; use datafusion_common::tree_node::{ TransformedResult, TreeNodeRewriter, TreeNodeVisitor, }; @@ -4771,6 +4793,38 @@ mod tests { .expect_err("unexpectedly succeeded to replace an invalid placeholder"); } + #[test] + fn test_replace_placeholder_mismatched_metadata() { + let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + + // Create a prepared statement with explicit fields that do not have metadata + let plan = table_scan(TableReference::none(), &schema, None) + .unwrap() + .filter(col("id").eq(placeholder("$1"))) + .unwrap() + .build() + .unwrap(); + let prepared_builder = LogicalPlanBuilder::new(plan) + .prepare( + "".to_string(), + vec![Field::new("", DataType::Int32, true).into()], + ) + .unwrap(); + + // Attempt to bind a parameter with metadata + let mut scalar_meta = HashMap::new(); + scalar_meta.insert("some_key".to_string(), "some_value".to_string()); + let param_values = ParamValues::List(vec![ScalarAndMetadata::new( + ScalarValue::Int32(Some(42)), + Some(scalar_meta.into()), + )]); + 
prepared_builder + .plan() + .clone() + .with_param_values(param_values) + .expect_err("prepared field metadata mismatch unexpectedly succeeded"); + } + #[test] fn test_nullable_schema_after_grouping_set() { let schema = Schema::new(vec![ @@ -5143,7 +5197,7 @@ mod tests { .unwrap(); // Check that the placeholder parameters have not received a DataType. - let params = plan.get_parameter_types().unwrap(); + let params = plan.get_parameter_fields().unwrap(); assert_eq!(params.len(), 1); let parameter_type = params.clone().get(placeholder_value).unwrap().clone(); diff --git a/datafusion/expr/src/logical_plan/statement.rs b/datafusion/expr/src/logical_plan/statement.rs index 6d3fe9fa75ac..bfc6b53d1136 100644 --- a/datafusion/expr/src/logical_plan/statement.rs +++ b/datafusion/expr/src/logical_plan/statement.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use arrow::datatypes::DataType; +use arrow::datatypes::FieldRef; +use datafusion_common::metadata::format_type_and_metadata; use datafusion_common::{DFSchema, DFSchemaRef}; use itertools::Itertools as _; use std::fmt::{self, Display}; @@ -108,10 +109,18 @@ impl Statement { }) => { write!(f, "SetVariable: set {variable:?} to {value:?}") } - Statement::Prepare(Prepare { - name, data_types, .. - }) => { - write!(f, "Prepare: {name:?} [{}]", data_types.iter().join(", ")) + Statement::Prepare(Prepare { name, fields, .. }) => { + write!( + f, + "Prepare: {name:?} [{}]", + fields + .iter() + .map(|f| format_type_and_metadata( + f.data_type(), + Some(f.metadata()) + )) + .join(", ") + ) } Statement::Execute(Execute { name, parameters, .. @@ -192,7 +201,7 @@ pub struct Prepare { /// The name of the statement pub name: String, /// Data types of the parameters ([`Expr::Placeholder`]) - pub data_types: Vec, + pub fields: Vec, /// The logical plan of the statements pub input: Arc, } diff --git a/datafusion/expr/src/select_expr.rs b/datafusion/expr/src/select_expr.rs index 039df20f397b..bfec4c5844d0 100644 --- a/datafusion/expr/src/select_expr.rs +++ b/datafusion/expr/src/select_expr.rs @@ -44,10 +44,8 @@ use crate::{expr::WildcardOptions, Expr}; /// let wildcard = SelectExpr::Wildcard(WildcardOptions::default()); /// /// // SELECT mytable.* -/// let qualified = SelectExpr::QualifiedWildcard( -/// "mytable".into(), -/// WildcardOptions::default() -/// ); +/// let qualified = +/// SelectExpr::QualifiedWildcard("mytable".into(), WildcardOptions::default()); /// /// // SELECT col1 /// let expr = SelectExpr::Expression(col("col1").into()); diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index c4cd8c006d1f..fd54bb13a62f 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -568,7 +568,6 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// /// * `Some(ScalarUDF)` - A new instance of this function configured with the new settings /// * `None` - If this function does not change with new configuration settings (the default) - /// fn with_updated_config(&self, _config: &ConfigOptions) -> Option { None } @@ -604,10 +603,10 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// # struct Example{} /// # impl Example { /// fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { - /// // report output is only nullable if any one of the arguments are nullable - /// let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); - /// let field = Arc::new(Field::new("ignored_name", DataType::Int32, true)); - /// Ok(field) + /// // 
report output is only nullable if any one of the arguments are nullable + /// let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + /// let field = Arc::new(Field::new("ignored_name", DataType::Int32, true)); + /// Ok(field) /// } /// # } /// ``` diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index b91db4527b3a..cd733e0a130a 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -890,7 +890,6 @@ pub fn check_all_columns_from_schema( /// all referenced column of the right side is from the right schema. /// 2. Or opposite. All referenced column of the left side is from the right schema, /// and the right side is from the left schema. -/// pub fn find_valid_equijoin_key_pair( left_key: &Expr, right_key: &Expr, @@ -936,7 +935,7 @@ pub fn generate_signature_error_msg( ) -> String { let candidate_signatures = func_signature .type_signature - .to_string_repr() + .to_string_repr_with_names(func_signature.parameter_names.as_deref()) .iter() .map(|args_str| format!("\t{func_name}({args_str})")) .collect::>() @@ -1034,10 +1033,7 @@ pub fn iter_conjunction_owned(expr: Expr) -> impl Iterator { /// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); /// /// // [a=1, b=2] -/// let split = vec![ -/// col("a").eq(lit(1)), -/// col("b").eq(lit(2)), -/// ]; +/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))]; /// /// // use split_conjunction_owned to split them /// assert_eq!(split_conjunction_owned(expr), split); @@ -1060,10 +1056,7 @@ pub fn split_conjunction_owned(expr: Expr) -> Vec { /// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2))); /// /// // [a=1, b=2] -/// let split = vec![ -/// col("a").eq(lit(1)), -/// col("b").eq(lit(2)), -/// ]; +/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))]; /// /// // use split_binary_owned to split them /// assert_eq!(split_binary_owned(expr, Operator::Plus), split); @@ -1131,10 +1124,7 @@ fn split_binary_impl<'a>( /// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); /// /// // [a=1, b=2] -/// let split = vec![ -/// col("a").eq(lit(1)), -/// col("b").eq(lit(2)), -/// ]; +/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))]; /// /// // use conjunction to join them together with `AND` /// assert_eq!(conjunction(split), Some(expr)); @@ -1157,10 +1147,7 @@ pub fn conjunction(filters: impl IntoIterator) -> Option { /// let expr = col("a").eq(lit(1)).or(col("b").eq(lit(2))); /// /// // [a=1, b=2] -/// let split = vec![ -/// col("a").eq(lit(1)), -/// col("b").eq(lit(2)), -/// ]; +/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))]; /// /// // use disjunction to join them together with `OR` /// assert_eq!(disjunction(split), Some(expr)); @@ -1295,6 +1282,7 @@ mod tests { Cast, ExprFunctionExt, WindowFunctionDefinition, }; use arrow::datatypes::{UnionFields, UnionMode}; + use datafusion_expr_common::signature::{TypeSignature, Volatility}; #[test] fn test_group_window_expr_by_sort_keys_empty_case() -> Result<()> { @@ -1714,4 +1702,52 @@ mod tests { DataType::List(Arc::new(Field::new("my_union", union_type, true))); assert!(!can_hash(&list_union_type)); } + + #[test] + fn test_generate_signature_error_msg_with_parameter_names() { + let sig = Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Utf8, DataType::Int64]), + TypeSignature::Exact(vec![ + DataType::Utf8, + DataType::Int64, + DataType::Int64, + ]), + ], + Volatility::Immutable, + ) + .with_parameter_names(vec![ + "str".to_string(), + "start_pos".to_string(), + "length".to_string(), + 
]) + .expect("valid parameter names"); + + // Generate error message with only 1 argument provided + let error_msg = generate_signature_error_msg("substr", sig, &[DataType::Utf8]); + + assert!( + error_msg.contains("str: Utf8, start_pos: Int64"), + "Expected 'str: Utf8, start_pos: Int64' in error message, got: {error_msg}" + ); + assert!( + error_msg.contains("str: Utf8, start_pos: Int64, length: Int64"), + "Expected 'str: Utf8, start_pos: Int64, length: Int64' in error message, got: {error_msg}" + ); + } + + #[test] + fn test_generate_signature_error_msg_without_parameter_names() { + let sig = Signature::one_of( + vec![TypeSignature::Any(2), TypeSignature::Any(3)], + Volatility::Immutable, + ); + + let error_msg = generate_signature_error_msg("my_func", sig, &[DataType::Int32]); + + assert!( + error_msg.contains("Any, Any"), + "Expected 'Any, Any' without parameter names, got: {error_msg}" + ); + } } diff --git a/datafusion/expr/src/window_frame.rs b/datafusion/expr/src/window_frame.rs index f72dc10a6950..5fb2916c34e9 100644 --- a/datafusion/expr/src/window_frame.rs +++ b/datafusion/expr/src/window_frame.rs @@ -307,7 +307,6 @@ impl WindowFrame { /// 3. CURRENT ROW /// 4. `` FOLLOWING /// 5. UNBOUNDED FOLLOWING -/// #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub enum WindowFrameBound { /// 1. UNBOUNDED PRECEDING diff --git a/datafusion/functions-aggregate-common/src/utils.rs b/datafusion/functions-aggregate-common/src/utils.rs index b01f2c8629c9..7ce5f09373f5 100644 --- a/datafusion/functions-aggregate-common/src/utils.rs +++ b/datafusion/functions-aggregate-common/src/utils.rs @@ -95,6 +95,8 @@ pub struct DecimalAverager { target_mul: T::Native, /// the output precision target_precision: u8, + /// the output scale + target_scale: i8, } impl DecimalAverager { @@ -129,6 +131,7 @@ impl DecimalAverager { sum_mul, target_mul, target_precision, + target_scale, }) } else { // can't convert the lit decimal to the returned data type @@ -147,8 +150,11 @@ impl DecimalAverager { if let Ok(value) = sum.mul_checked(self.target_mul.div_wrapping(self.sum_mul)) { let new_value = value.div_wrapping(count); - let validate = - T::validate_decimal_precision(new_value, self.target_precision); + let validate = T::validate_decimal_precision( + new_value, + self.target_precision, + self.target_scale, + ); if validate.is_ok() { Ok(new_value) diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index 28755427c732..73f2ec112ffc 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -166,7 +166,14 @@ impl AggregateUDFImpl for FirstValue { ) .into()]; fields.extend(args.ordering_fields.iter().cloned()); - fields.push(Field::new("is_set", DataType::Boolean, true).into()); + fields.push( + Field::new( + format_state_name(args.name, "first_value_is_set"), + DataType::Boolean, + true, + ) + .into(), + ); Ok(fields) } @@ -810,6 +817,8 @@ impl Accumulator for TrivialFirstValueAccumulator { // Second index contains is_set flag. if !self.is_set { let flags = states[1].as_boolean(); + validate_is_set_flags(flags, "first_value")?; + let filtered_states = filter_states_according_to_is_set(&states[0..1], flags)?; if let Some(first) = filtered_states.first() { @@ -955,6 +964,8 @@ impl Accumulator for FirstValueAccumulator { // last index contains is_set flag. 
let is_set_idx = states.len() - 1; let flags = states[is_set_idx].as_boolean(); + validate_is_set_flags(flags, "first_value")?; + let filtered_states = filter_states_according_to_is_set(&states[0..is_set_idx], flags)?; // 1..is_set_idx range corresponds to ordering section @@ -1087,7 +1098,14 @@ impl AggregateUDFImpl for LastValue { ) .into()]; fields.extend(args.ordering_fields.iter().cloned()); - fields.push(Field::new("is_set", DataType::Boolean, true).into()); + fields.push( + Field::new( + format_state_name(args.name, "last_value_is_set"), + DataType::Boolean, + true, + ) + .into(), + ); Ok(fields) } @@ -1285,6 +1303,8 @@ impl Accumulator for TrivialLastValueAccumulator { // LAST_VALUE(last1, last2, last3, ...) // Second index contains is_set flag. let flags = states[1].as_boolean(); + validate_is_set_flags(flags, "last_value")?; + let filtered_states = filter_states_according_to_is_set(&states[0..1], flags)?; if let Some(last) = filtered_states.last() { if !last.is_empty() { @@ -1430,6 +1450,8 @@ impl Accumulator for LastValueAccumulator { // last index contains is_set flag. let is_set_idx = states.len() - 1; let flags = states[is_set_idx].as_boolean(); + validate_is_set_flags(flags, "last_value")?; + let filtered_states = filter_states_according_to_is_set(&states[0..is_set_idx], flags)?; // 1..is_set_idx range corresponds to ordering section @@ -1473,6 +1495,16 @@ impl Accumulator for LastValueAccumulator { } } +/// Validates that `is_set flags` do not contain NULL values. +fn validate_is_set_flags(flags: &BooleanArray, function_name: &str) -> Result<()> { + if flags.null_count() > 0 { + return Err(DataFusionError::Internal(format!( + "{function_name}: is_set flags contain nulls" + ))); + } + Ok(()) +} + /// Filters states according to the `is_set` flag at the last column and returns /// the resulting states. 
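A small usage sketch for the `validate_is_set_flags` helper introduced above; since the helper is private, this assumes the code lives in the same module:

use arrow::array::BooleanArray;
use datafusion_common::Result;

fn check_flags_example() -> Result<()> {
    // A well-formed is_set column merges fine.
    let flags = BooleanArray::from(vec![Some(true), Some(false)]);
    validate_is_set_flags(&flags, "first_value")?;

    // A null flag indicates corrupted intermediate state, so the merge paths
    // now fail with an internal error instead of silently filtering the row.
    let corrupted = BooleanArray::from(vec![Some(true), None]);
    assert!(validate_is_set_flags(&corrupted, "first_value").is_err());
    Ok(())
}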
fn filter_states_according_to_is_set( @@ -1501,7 +1533,7 @@ mod tests { use std::iter::repeat_with; use arrow::{ - array::{Int64Array, ListArray}, + array::{BooleanArray, Int64Array, ListArray, StringArray}, compute::SortOptions, datatypes::Schema, }; @@ -1914,4 +1946,90 @@ mod tests { Ok(()) } + + #[test] + fn test_first_value_merge_with_is_set_nulls() -> Result<()> { + // Test data with corrupted is_set flag + let value = Arc::new(StringArray::from(vec![Some("first_string")])) as ArrayRef; + let corrupted_flag = Arc::new(BooleanArray::from(vec![None])) as ArrayRef; + + // Test TrivialFirstValueAccumulator + let mut trivial_accumulator = + TrivialFirstValueAccumulator::try_new(&DataType::Utf8, false)?; + let trivial_states = vec![Arc::clone(&value), Arc::clone(&corrupted_flag)]; + let result = trivial_accumulator.merge_batch(&trivial_states); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is_set flags contain nulls")); + + // Test FirstValueAccumulator (with ordering) + let schema = Schema::new(vec![Field::new("ordering", DataType::Int64, false)]); + let ordering_expr = col("ordering", &schema)?; + let mut ordered_accumulator = FirstValueAccumulator::try_new( + &DataType::Utf8, + &[DataType::Int64], + LexOrdering::new(vec![PhysicalSortExpr { + expr: ordering_expr, + options: SortOptions::default(), + }]) + .unwrap(), + false, + false, + )?; + let ordering = Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef; + let ordered_states = vec![value, ordering, corrupted_flag]; + let result = ordered_accumulator.merge_batch(&ordered_states); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is_set flags contain nulls")); + + Ok(()) + } + + #[test] + fn test_last_value_merge_with_is_set_nulls() -> Result<()> { + // Test data with corrupted is_set flag + let value = Arc::new(StringArray::from(vec![Some("last_string")])) as ArrayRef; + let corrupted_flag = Arc::new(BooleanArray::from(vec![None])) as ArrayRef; + + // Test TrivialLastValueAccumulator + let mut trivial_accumulator = + TrivialLastValueAccumulator::try_new(&DataType::Utf8, false)?; + let trivial_states = vec![Arc::clone(&value), Arc::clone(&corrupted_flag)]; + let result = trivial_accumulator.merge_batch(&trivial_states); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is_set flags contain nulls")); + + // Test LastValueAccumulator (with ordering) + let schema = Schema::new(vec![Field::new("ordering", DataType::Int64, false)]); + let ordering_expr = col("ordering", &schema)?; + let mut ordered_accumulator = LastValueAccumulator::try_new( + &DataType::Utf8, + &[DataType::Int64], + LexOrdering::new(vec![PhysicalSortExpr { + expr: ordering_expr, + options: SortOptions::default(), + }]) + .unwrap(), + false, + false, + )?; + let ordering = Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef; + let ordered_states = vec![value, ordering, corrupted_flag]; + let result = ordered_accumulator.merge_batch(&ordered_states); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is_set flags contain nulls")); + + Ok(()) + } } diff --git a/datafusion/functions-nested/src/macros.rs b/datafusion/functions-nested/src/macros.rs index cec7f2fd562d..5380f6b1272d 100644 --- a/datafusion/functions-nested/src/macros.rs +++ b/datafusion/functions-nested/src/macros.rs @@ -41,10 +41,15 @@ /// * `arg`: 0 or more named arguments for the function /// * `DOC`: documentation string for the function /// * 
`SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF` +/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it +/// automatically resolves to `$UDF::new()`. /// /// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl macro_rules! make_udf_expr_and_func { - ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , $SCALAR_UDF_FN:ident) => { + ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $SCALAR_UDF_FN:ident) => { + make_udf_expr_and_func!($UDF, $EXPR_FN, $($arg)*, $DOC, $SCALAR_UDF_FN, $UDF::new); + }; + ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $SCALAR_UDF_FN:ident, $CTOR:path) => { paste::paste! { // "fluent expr_fn" style function #[doc = $DOC] @@ -54,10 +59,13 @@ macro_rules! make_udf_expr_and_func { vec![$($arg),*], )) } - create_func!($UDF, $SCALAR_UDF_FN); + create_func!($UDF, $SCALAR_UDF_FN, $CTOR); } }; - ($UDF:ty, $EXPR_FN:ident, $DOC:expr , $SCALAR_UDF_FN:ident) => { + ($UDF:ident, $EXPR_FN:ident, $DOC:expr, $SCALAR_UDF_FN:ident) => { + make_udf_expr_and_func!($UDF, $EXPR_FN, $DOC, $SCALAR_UDF_FN, $UDF::new); + }; + ($UDF:ident, $EXPR_FN:ident, $DOC:expr, $SCALAR_UDF_FN:ident, $CTOR:path) => { paste::paste! { // "fluent expr_fn" style function #[doc = $DOC] @@ -67,7 +75,7 @@ macro_rules! make_udf_expr_and_func { arg, )) } - create_func!($UDF, $SCALAR_UDF_FN); + create_func!($UDF, $SCALAR_UDF_FN, $CTOR); } }; } @@ -80,10 +88,15 @@ macro_rules! make_udf_expr_and_func { /// # Arguments /// * `UDF`: name of the [`ScalarUDFImpl`] /// * `SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF` +/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it +/// automatically resolves to `$UDF::new()`. /// /// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl macro_rules! create_func { - ($UDF:ty, $SCALAR_UDF_FN:ident) => { + ($UDF:ident, $SCALAR_UDF_FN:ident) => { + create_func!($UDF, $SCALAR_UDF_FN, $UDF::new); + }; + ($UDF:ident, $SCALAR_UDF_FN:ident, $CTOR:path) => { paste::paste! { #[doc = concat!("ScalarFunction that returns a [`ScalarUDF`](datafusion_expr::ScalarUDF) for ")] #[doc = stringify!($UDF)] @@ -92,7 +105,7 @@ macro_rules! 
create_func { static INSTANCE: std::sync::LazyLock> = std::sync::LazyLock::new(|| { std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( - <$UDF>::new(), + $CTOR(), )) }); std::sync::Arc::clone(&INSTANCE) diff --git a/datafusion/functions-nested/src/range.rs b/datafusion/functions-nested/src/range.rs index 619b0e84c19a..01c6e9c43f2e 100644 --- a/datafusion/functions-nested/src/range.rs +++ b/datafusion/functions-nested/src/range.rs @@ -22,20 +22,23 @@ use arrow::array::{ builder::{Date32Builder, TimestampNanosecondBuilder}, temporal_conversions::as_datetime_with_timezone, timezone::Tz, - types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType as TSNT}, - Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullArray, NullBufferBuilder, - TimestampNanosecondArray, + types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType}, + Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullBufferBuilder, }; use arrow::buffer::OffsetBuffer; use arrow::datatypes::{ DataType, DataType::*, Field, IntervalUnit::MonthDayNano, TimeUnit::Nanosecond, }; -use datafusion_common::cast::{ - as_date32_array, as_int64_array, as_interval_mdn_array, as_timestamp_nanosecond_array, +use datafusion_common::{ + cast::{ + as_date32_array, as_int64_array, as_interval_mdn_array, + as_timestamp_nanosecond_array, + }, + DataFusionError, ScalarValue, }; use datafusion_common::{ - exec_datafusion_err, exec_err, internal_err, not_impl_datafusion_err, - utils::take_function_args, Result, + exec_datafusion_err, exec_err, not_impl_datafusion_err, utils::take_function_args, + Result, }; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, @@ -53,13 +56,24 @@ make_udf_expr_and_func!( range, start stop step, "create a list of values in the range between start and stop", - range_udf + range_udf, + Range::new +); + +make_udf_expr_and_func!( + GenSeries, + gen_series, + start stop step, + "create a list of values in the range between start and stop, include upper bound", + gen_series_udf, + Range::generate_series ); #[user_doc( doc_section(label = "Array Functions"), description = "Returns an Arrow array between start and stop with step. The range start..end contains all values with start <= x < end. It is empty if start >= end. Step cannot be 0.", - syntax_example = "range(start, stop, step)", + syntax_example = "range(stop) +range(start, stop[, step])", sql_example = r#"```sql > select range(2, 10, 3); +-----------------------------------+ @@ -69,11 +83,11 @@ make_udf_expr_and_func!( +-----------------------------------+ > select range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH); -+--------------------------------------------------------------+ -| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | -+--------------------------------------------------------------+ ++--------------------------------------------------------------------------+ +| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | ++--------------------------------------------------------------------------+ | [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] | -+--------------------------------------------------------------+ ++--------------------------------------------------------------------------+ ```"#, argument( name = "start", @@ -88,115 +102,13 @@ make_udf_expr_and_func!( description = "Increase by step (cannot be 0). Steps less than a day are supported only for timestamp ranges." 
) )] -#[derive(Debug, PartialEq, Eq, Hash)] -pub struct Range { - signature: Signature, - aliases: Vec, -} - -impl Default for Range { - fn default() -> Self { - Self::new() - } -} -impl Range { - pub fn new() -> Self { - Self { - signature: Signature::user_defined(Volatility::Immutable), - aliases: vec![], - } - } -} -impl ScalarUDFImpl for Range { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { - "range" - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - arg_types - .iter() - .map(|arg_type| match arg_type { - Null => Ok(Null), - Int8 => Ok(Int64), - Int16 => Ok(Int64), - Int32 => Ok(Int64), - Int64 => Ok(Int64), - UInt8 => Ok(Int64), - UInt16 => Ok(Int64), - UInt32 => Ok(Int64), - UInt64 => Ok(Int64), - Timestamp(_, tz) => Ok(Timestamp(Nanosecond, tz.clone())), - Date32 => Ok(Date32), - Date64 => Ok(Date32), - Utf8 => Ok(Date32), - LargeUtf8 => Ok(Date32), - Utf8View => Ok(Date32), - Interval(_) => Ok(Interval(MonthDayNano)), - _ => exec_err!("Unsupported DataType"), - }) - .try_collect() - } - - fn return_type(&self, arg_types: &[DataType]) -> Result { - if arg_types.iter().any(|t| t.is_null()) { - Ok(Null) - } else { - Ok(List(Arc::new(Field::new_list_field( - arg_types[0].clone(), - true, - )))) - } - } - - fn invoke_with_args( - &self, - args: datafusion_expr::ScalarFunctionArgs, - ) -> Result { - let args = &args.args; - - if args.iter().any(|arg| arg.data_type().is_null()) { - return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1)))); - } - match args[0].data_type() { - Int64 => make_scalar_function(|args| gen_range_inner(args, false))(args), - Date32 => make_scalar_function(|args| gen_range_date(args, false))(args), - Timestamp(_, _) => { - make_scalar_function(|args| gen_range_timestamp(args, false))(args) - } - dt => { - exec_err!("unsupported type for RANGE. Expected Int64, Date32 or Timestamp, got: {dt}") - } - } - } - - fn aliases(&self) -> &[String] { - &self.aliases - } - - fn documentation(&self) -> Option<&Documentation> { - self.doc() - } -} - -make_udf_expr_and_func!( - GenSeries, - gen_series, - start stop step, - "create a list of values in the range between start and stop, include upper bound", - gen_series_udf -); +struct RangeDoc {} #[user_doc( doc_section(label = "Array Functions"), description = "Similar to the range function, but it includes the upper bound.", - syntax_example = "generate_series(start, stop, step)", + syntax_example = "generate_series(stop) +generate_series(start, stop[, step])", sql_example = r#"```sql > select generate_series(1,3); +------------------------------------+ @@ -218,25 +130,50 @@ make_udf_expr_and_func!( description = "Increase by step (can not be 0). Steps less than a day are supported only for timestamp ranges." ) )] +struct GenerateSeriesDoc {} + #[derive(Debug, PartialEq, Eq, Hash)] -pub(super) struct GenSeries { +pub struct Range { signature: Signature, - aliases: Vec, + /// `false` for range, `true` for generate_series + include_upper_bound: bool, +} + +impl Default for Range { + fn default() -> Self { + Self::new() + } } -impl GenSeries { + +impl Range { + /// Generate `range()` function which excludes upper bound. pub fn new() -> Self { Self { signature: Signature::user_defined(Volatility::Immutable), - aliases: vec![], + include_upper_bound: false, + } + } + + /// Generate `generate_series()` function which includes upper bound. 
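For reference, here is a minimal standalone sketch (plain `std`, no DataFusion types) of the pattern this refactor relies on: a single struct whose constructors differ only in an `include_upper_bound` flag, so `range` and `generate_series` can share one implementation and one name lookup. `SeriesGen` and its integer-only `eval` are hypothetical simplifications of the unified `Range` UDF.

```rust
/// One implementation serving two SQL functions, switched by a flag.
struct SeriesGen {
    include_upper_bound: bool,
}

impl SeriesGen {
    /// Analogous to `Range::new()`: upper bound excluded.
    fn range() -> Self {
        Self { include_upper_bound: false }
    }
    /// Analogous to `Range::generate_series()`: upper bound included.
    fn generate_series() -> Self {
        Self { include_upper_bound: true }
    }
    fn name(&self) -> &'static str {
        if self.include_upper_bound { "generate_series" } else { "range" }
    }
    /// Integer-only evaluation; the real UDF also handles dates and timestamps.
    fn eval(&self, start: i64, stop: i64, step: i64) -> Vec<i64> {
        assert_ne!(step, 0, "step can't be 0");
        let mut out = Vec::new();
        let mut v = start;
        while (step > 0 && (v < stop || (self.include_upper_bound && v == stop)))
            || (step < 0 && (v > stop || (self.include_upper_bound && v == stop)))
        {
            out.push(v);
            v += step;
        }
        out
    }
}

fn main() {
    assert_eq!(SeriesGen::range().eval(2, 10, 3), vec![2, 5, 8]);
    assert_eq!(SeriesGen::generate_series().eval(1, 5, 2), vec![1, 3, 5]);
    assert_eq!(SeriesGen::range().name(), "range");
}
```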
+ fn generate_series() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + include_upper_bound: true, } } } -impl ScalarUDFImpl for GenSeries { + +impl ScalarUDFImpl for Range { fn as_any(&self) -> &dyn Any { self } + fn name(&self) -> &str { - "generate_series" + if self.include_upper_bound { + "generate_series" + } else { + "range" + } } fn signature(&self) -> &Signature { @@ -286,107 +223,263 @@ impl ScalarUDFImpl for GenSeries { let args = &args.args; if args.iter().any(|arg| arg.data_type().is_null()) { - return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1)))); + return Ok(ColumnarValue::Scalar(ScalarValue::Null)); } match args[0].data_type() { - Int64 => make_scalar_function(|args| gen_range_inner(args, true))(args), - Date32 => make_scalar_function(|args| gen_range_date(args, true))(args), + Int64 => make_scalar_function(|args| self.gen_range_inner(args))(args), + Date32 => make_scalar_function(|args| self.gen_range_date(args))(args), Timestamp(_, _) => { - make_scalar_function(|args| gen_range_timestamp(args, true))(args) + make_scalar_function(|args| self.gen_range_timestamp(args))(args) } dt => { - exec_err!( - "unsupported type for GENERATE_SERIES. Expected Int64, Date32 or Timestamp, got: {}", - dt - ) + exec_err!("unsupported type for {}. Expected Int64, Date32 or Timestamp, got: {dt}", self.name()) } } } - fn aliases(&self) -> &[String] { - &self.aliases - } - fn documentation(&self) -> Option<&Documentation> { - self.doc() + if self.include_upper_bound { + GenerateSeriesDoc {}.doc() + } else { + RangeDoc {}.doc() + } } } -/// Generates an array of integers from start to stop with a given step. -/// -/// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values. -/// It returns a `Result` representing the resulting ListArray after the operation. -/// -/// # Arguments -/// -/// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and step(step value can not be zero.) values. -/// -/// # Examples -/// -/// gen_range(3) => [0, 1, 2] -/// gen_range(1, 4) => [1, 2, 3] -/// gen_range(1, 7, 2) => [1, 3, 5] -pub(super) fn gen_range_inner( - args: &[ArrayRef], - include_upper: bool, -) -> Result { - let (start_array, stop_array, step_array) = match args.len() { - 1 => (None, as_int64_array(&args[0])?, None), - 2 => ( - Some(as_int64_array(&args[0])?), - as_int64_array(&args[1])?, - None, - ), - 3 => ( - Some(as_int64_array(&args[0])?), - as_int64_array(&args[1])?, - Some(as_int64_array(&args[2])?), - ), - _ => return exec_err!("gen_range expects 1 to 3 arguments"), - }; - - let mut values = vec![]; - let mut offsets = vec![0]; - let mut valid = NullBufferBuilder::new(stop_array.len()); - for (idx, stop) in stop_array.iter().enumerate() { - match retrieve_range_args(start_array, stop, step_array, idx) { - Some((_, _, 0)) => { - return exec_err!( - "step can't be 0 for function {}(start [, stop, step])", - if include_upper { - "generate_series" - } else { - "range" - } - ); +impl Range { + /// Generates an array of integers from start to stop with a given step. + /// + /// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values. + /// It returns a `Result` representing the resulting ListArray after the operation. + /// + /// # Arguments + /// + /// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and step(step value can not be zero.) values. 
+ /// + /// # Examples + /// + /// gen_range(3) => [0, 1, 2] + /// gen_range(1, 4) => [1, 2, 3] + /// gen_range(1, 7, 2) => [1, 3, 5] + fn gen_range_inner(&self, args: &[ArrayRef]) -> Result { + let (start_array, stop_array, step_array) = match args { + [stop_array] => (None, as_int64_array(stop_array)?, None), + [start_array, stop_array] => ( + Some(as_int64_array(start_array)?), + as_int64_array(stop_array)?, + None, + ), + [start_array, stop_array, step_array] => ( + Some(as_int64_array(start_array)?), + as_int64_array(stop_array)?, + Some(as_int64_array(step_array)?), + ), + _ => return exec_err!("{} expects 1 to 3 arguments", self.name()), + }; + + let mut values = vec![]; + let mut offsets = vec![0]; + let mut valid = NullBufferBuilder::new(stop_array.len()); + for (idx, stop) in stop_array.iter().enumerate() { + match retrieve_range_args(start_array, stop, step_array, idx) { + Some((_, _, 0)) => { + return exec_err!( + "step can't be 0 for function {}(start [, stop, step])", + self.name() + ); + } + Some((start, stop, step)) => { + // Below, we utilize `usize` to represent steps. + // On 32-bit targets, the absolute value of `i64` may fail to fit into `usize`. + let step_abs = + usize::try_from(step.unsigned_abs()).map_err(|_| { + not_impl_datafusion_err!("step {} can't fit into usize", step) + })?; + values.extend( + gen_range_iter(start, stop, step < 0, self.include_upper_bound) + .step_by(step_abs), + ); + offsets.push(values.len() as i32); + valid.append_non_null(); + } + // If any of the arguments is NULL, append a NULL value to the result. + None => { + offsets.push(values.len() as i32); + valid.append_null(); + } + }; + } + let arr = Arc::new(ListArray::try_new( + Arc::new(Field::new_list_field(Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(Int64Array::from(values)), + valid.finish(), + )?); + Ok(arr) + } + + fn gen_range_date(&self, args: &[ArrayRef]) -> Result { + let [start, stop, step] = take_function_args(self.name(), args)?; + + let (start_array, stop_array, step_array) = ( + as_date32_array(start)?, + as_date32_array(stop)?, + as_interval_mdn_array(step)?, + ); + + // values are date32s + let values_builder = Date32Builder::new(); + let mut list_builder = ListBuilder::new(values_builder); + + for idx in 0..stop_array.len() { + if start_array.is_null(idx) + || stop_array.is_null(idx) + || step_array.is_null(idx) + { + list_builder.append_null(); + continue; } - Some((start, stop, step)) => { - // Below, we utilize `usize` to represent steps. - // On 32-bit targets, the absolute value of `i64` may fail to fit into `usize`. 
- let step_abs = usize::try_from(step.unsigned_abs()).map_err(|_| { - not_impl_datafusion_err!("step {} can't fit into usize", step) - })?; - values.extend( - gen_range_iter(start, stop, step < 0, include_upper) - .step_by(step_abs), - ); - offsets.push(values.len() as i32); - valid.append_non_null(); + + let start = start_array.value(idx); + let stop = stop_array.value(idx); + let step = step_array.value(idx); + + let (months, days, _) = IntervalMonthDayNanoType::to_parts(step); + if months == 0 && days == 0 { + return exec_err!("Cannot generate date range less than 1 day."); + } + + let stop = if !self.include_upper_bound { + Date32Type::subtract_month_day_nano(stop, step) + } else { + stop + }; + + let neg = months < 0 || days < 0; + let mut new_date = start; + + let values = from_fn(|| { + if (neg && new_date < stop) || (!neg && new_date > stop) { + None + } else { + let current_date = new_date; + new_date = Date32Type::add_month_day_nano(new_date, step); + Some(Some(current_date)) + } + }); + + list_builder.append_value(values); + } + + let arr = Arc::new(list_builder.finish()); + + Ok(arr) + } + + fn gen_range_timestamp(&self, args: &[ArrayRef]) -> Result { + let [start, stop, step] = take_function_args(self.name(), args)?; + + // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz) + // TODO: remove these map_err once the signature is robust enough to guard against this + let start_arr = as_timestamp_nanosecond_array(start).map_err(|_e| { + DataFusionError::Internal(format!( + "Unexpected argument type for {} : {}", + self.name(), + start.data_type() + )) + })?; + let stop_arr = as_timestamp_nanosecond_array(stop).map_err(|_e| { + DataFusionError::Internal(format!( + "Unexpected argument type for {} : {}", + self.name(), + stop.data_type() + )) + })?; + let step_arr = as_interval_mdn_array(step)?; + let start_tz = parse_tz(&start_arr.timezone())?; + let stop_tz = parse_tz(&stop_arr.timezone())?; + + // values are timestamps + let values_builder = start_arr + .timezone() + .map_or_else(TimestampNanosecondBuilder::new, |start_tz_str| { + TimestampNanosecondBuilder::new().with_timezone(start_tz_str) + }); + let mut list_builder = ListBuilder::new(values_builder); + + for idx in 0..start_arr.len() { + if start_arr.is_null(idx) || stop_arr.is_null(idx) || step_arr.is_null(idx) { + list_builder.append_null(); + continue; } - // If any of the arguments is NULL, append a NULL value to the result. - None => { - offsets.push(values.len() as i32); - valid.append_null(); + + let start = start_arr.value(idx); + let stop = stop_arr.value(idx); + let step = step_arr.value(idx); + + let (months, days, ns) = IntervalMonthDayNanoType::to_parts(step); + if months == 0 && days == 0 && ns == 0 { + return exec_err!("Interval argument to {} must not be 0", self.name()); } - }; + + let neg = TimestampNanosecondType::add_month_day_nano(start, step, start_tz) + .ok_or(exec_datafusion_err!( + "Cannot generate timestamp range where start + step overflows" + ))? 
+ .cmp(&start) + == Ordering::Less; + + let stop_dt = + as_datetime_with_timezone::(stop, stop_tz) + .ok_or(exec_datafusion_err!( + "Cannot generate timestamp for stop: {}: {:?}", + stop, + stop_tz + ))?; + + let mut current = start; + let mut current_dt = + as_datetime_with_timezone::(current, start_tz) + .ok_or(exec_datafusion_err!( + "Cannot generate timestamp for start: {}: {:?}", + current, + start_tz + ))?; + + let values = from_fn(|| { + let generate_series_should_end = self.include_upper_bound + && ((neg && current_dt < stop_dt) || (!neg && current_dt > stop_dt)); + let range_should_end = !self.include_upper_bound + && ((neg && current_dt <= stop_dt) + || (!neg && current_dt >= stop_dt)); + if generate_series_should_end || range_should_end { + return None; + } + + let prev_current = current; + + if let Some(ts) = + TimestampNanosecondType::add_month_day_nano(current, step, start_tz) + { + current = ts; + current_dt = as_datetime_with_timezone::( + current, start_tz, + )?; + + Some(Some(prev_current)) + } else { + // we failed to parse the timestamp here so terminate the series + None + } + }); + + list_builder.append_value(values); + } + + let arr = Arc::new(list_builder.finish()); + + Ok(arr) } - let arr = Arc::new(ListArray::try_new( - Arc::new(Field::new_list_field(Int64, true)), - OffsetBuffer::new(offsets.into()), - Arc::new(Int64Array::from(values)), - valid.finish(), - )?); - Ok(arr) } /// Get the (start, stop, step) args for the range and generate_series function. @@ -436,201 +529,7 @@ fn gen_range_iter( } } -fn gen_range_date(args: &[ArrayRef], include_upper_bound: bool) -> Result { - let [start, stop, step] = take_function_args("range", args)?; - - let (start_array, stop_array, step_array) = ( - Some(as_date32_array(start)?), - as_date32_array(stop)?, - Some(as_interval_mdn_array(step)?), - ); - - // values are date32s - let values_builder = Date32Builder::new(); - let mut list_builder = ListBuilder::new(values_builder); - - for idx in 0..stop_array.len() { - if stop_array.is_null(idx) { - list_builder.append_null(); - continue; - } - let mut stop = stop_array.value(idx); - - let start = if let Some(start_array_values) = start_array { - if start_array_values.is_null(idx) { - list_builder.append_null(); - continue; - } - start_array_values.value(idx) - } else { - list_builder.append_null(); - continue; - }; - - let step = if let Some(step) = step_array { - if step.is_null(idx) { - list_builder.append_null(); - continue; - } - step.value(idx) - } else { - list_builder.append_null(); - continue; - }; - - let (months, days, _) = IntervalMonthDayNanoType::to_parts(step); - - if months == 0 && days == 0 { - return exec_err!("Cannot generate date range less than 1 day."); - } - - let neg = months < 0 || days < 0; - if !include_upper_bound { - stop = Date32Type::subtract_month_day_nano(stop, step); - } - let mut new_date = start; - - let values = from_fn(|| { - if (neg && new_date < stop) || (!neg && new_date > stop) { - None - } else { - let current_date = new_date; - new_date = Date32Type::add_month_day_nano(new_date, step); - Some(Some(current_date)) - } - }); - - list_builder.append_value(values); - } - - let arr = Arc::new(list_builder.finish()); - - Ok(arr) -} - -fn gen_range_timestamp(args: &[ArrayRef], include_upper_bound: bool) -> Result { - let func_name = if include_upper_bound { - "GENERATE_SERIES" - } else { - "RANGE" - }; - let [start, stop, step] = take_function_args(func_name, args)?; - - // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz) - 
let (start_arr, start_tz_opt) = cast_timestamp_arg(start, include_upper_bound)?; - let (stop_arr, stop_tz_opt) = cast_timestamp_arg(stop, include_upper_bound)?; - let step_arr = as_interval_mdn_array(step)?; - let start_tz = parse_tz(start_tz_opt)?; - let stop_tz = parse_tz(stop_tz_opt)?; - - // values are timestamps - let values_builder = start_tz_opt - .clone() - .map_or_else(TimestampNanosecondBuilder::new, |start_tz_str| { - TimestampNanosecondBuilder::new().with_timezone(start_tz_str) - }); - let mut list_builder = ListBuilder::new(values_builder); - - for idx in 0..start_arr.len() { - if start_arr.is_null(idx) || stop_arr.is_null(idx) || step_arr.is_null(idx) { - list_builder.append_null(); - continue; - } - - let start = start_arr.value(idx); - let stop = stop_arr.value(idx); - let step = step_arr.value(idx); - - let (months, days, ns) = IntervalMonthDayNanoType::to_parts(step); - if months == 0 && days == 0 && ns == 0 { - return exec_err!( - "Interval argument to {} must not be 0", - if include_upper_bound { - "GENERATE_SERIES" - } else { - "RANGE" - } - ); - } - - let neg = TSNT::add_month_day_nano(start, step, start_tz) - .ok_or(exec_datafusion_err!( - "Cannot generate timestamp range where start + step overflows" - ))? - .cmp(&start) - == Ordering::Less; - - let stop_dt = as_datetime_with_timezone::(stop, stop_tz).ok_or( - exec_datafusion_err!( - "Cannot generate timestamp for stop: {}: {:?}", - stop, - stop_tz - ), - )?; - - let mut current = start; - let mut current_dt = as_datetime_with_timezone::(current, start_tz).ok_or( - exec_datafusion_err!( - "Cannot generate timestamp for start: {}: {:?}", - current, - start_tz - ), - )?; - - let values = from_fn(|| { - if (include_upper_bound - && ((neg && current_dt < stop_dt) || (!neg && current_dt > stop_dt))) - || (!include_upper_bound - && ((neg && current_dt <= stop_dt) - || (!neg && current_dt >= stop_dt))) - { - return None; - } - - let prev_current = current; - - if let Some(ts) = TSNT::add_month_day_nano(current, step, start_tz) { - current = ts; - current_dt = as_datetime_with_timezone::(current, start_tz)?; - - Some(Some(prev_current)) - } else { - // we failed to parse the timestamp here so terminate the series - None - } - }); - - list_builder.append_value(values); - } - - let arr = Arc::new(list_builder.finish()); - - Ok(arr) -} - -fn cast_timestamp_arg( - arg: &ArrayRef, - include_upper: bool, -) -> Result<(&TimestampNanosecondArray, &Option>)> { - match arg.data_type() { - Timestamp(Nanosecond, tz_opt) => { - Ok((as_timestamp_nanosecond_array(arg)?, tz_opt)) - } - _ => { - internal_err!( - "Unexpected argument type for {} : {}", - if include_upper { - "GENERATE_SERIES" - } else { - "RANGE" - }, - arg.data_type() - ) - } - } -} - -fn parse_tz(tz: &Option>) -> Result { +fn parse_tz(tz: &Option<&str>) -> Result { let tz = tz.as_ref().map_or_else(|| "+00", |s| s); Tz::from_str(tz) diff --git a/datafusion/functions-nested/src/replace.rs b/datafusion/functions-nested/src/replace.rs index 59f851a776a1..4314d41419bc 100644 --- a/datafusion/functions-nested/src/replace.rs +++ b/datafusion/functions-nested/src/replace.rs @@ -105,6 +105,7 @@ impl ArrayReplace { }, ), volatility: Volatility::Immutable, + parameter_names: None, }, aliases: vec![String::from("list_replace")], } @@ -186,6 +187,7 @@ impl ArrayReplaceN { }, ), volatility: Volatility::Immutable, + parameter_names: None, }, aliases: vec![String::from("list_replace_n")], } @@ -265,6 +267,7 @@ impl ArrayReplaceAll { }, ), volatility: Volatility::Immutable, + 
parameter_names: None, }, aliases: vec![String::from("list_replace_all")], } diff --git a/datafusion/functions-table/src/generate_series.rs b/datafusion/functions-table/src/generate_series.rs index d00f3d734d76..c66e652147eb 100644 --- a/datafusion/functions-table/src/generate_series.rs +++ b/datafusion/functions-table/src/generate_series.rs @@ -237,6 +237,7 @@ impl GenerateSeriesTable { pub fn as_generator( &self, batch_size: usize, + projection: Option>, ) -> Result>> { let generator: Arc> = match &self.args { GenSeriesArgs::ContainsNull { name } => Arc::new(RwLock::new(Empty { name })), @@ -255,6 +256,7 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, + projection, })), GenSeriesArgs::TimestampArgs { start, @@ -295,6 +297,7 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, + projection, })) } GenSeriesArgs::DateArgs { @@ -324,6 +327,7 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, + projection, })), }; @@ -341,6 +345,7 @@ pub struct GenericSeriesState { current: T, include_end: bool, name: &'static str, + projection: Option>, } impl GenericSeriesState { @@ -396,7 +401,11 @@ impl LazyBatchGenerator for GenericSeriesState { let array = self.current.create_array(buf)?; let batch = RecordBatch::try_new(Arc::clone(&self.schema), vec![array])?; - Ok(Some(batch)) + let projected = match self.projection.as_ref() { + Some(projection) => batch.project(projection)?, + None => batch, + }; + Ok(Some(projected)) } } @@ -477,7 +486,7 @@ impl TableProvider for GenerateSeriesTable { None => self.schema(), }; - let generator = self.as_generator(batch_size)?; + let generator = self.as_generator(batch_size, projection.cloned())?; Ok(Arc::new(LazyMemoryExec::try_new(schema, vec![generator])?)) } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 90331fbccaf0..1dbeee7159fd 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -78,6 +78,7 @@ hex = { version = "0.4", optional = true } itertools = { workspace = true } log = { workspace = true } md-5 = { version = "^0.10.0", optional = true } +num-traits = { workspace = true } rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { version = "^0.10.9", optional = true } diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs index 94a41ba4bb25..c4e58601cd10 100644 --- a/datafusion/functions/src/core/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -60,16 +60,26 @@ use datafusion_macros::user_doc; description = "Casts a value to a specific Arrow data type.", syntax_example = "arrow_cast(expression, datatype)", sql_example = r#"```sql -> select arrow_cast(-5, 'Int8') as a, +> select + arrow_cast(-5, 'Int8') as a, arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, - arrow_cast('bar', 'LargeUtf8') as c, - arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d - ; -+----+-----+-----+---------------------------+ -| a | b | c | d | -+----+-----+-----+---------------------------+ -| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | -+----+-----+-----+---------------------------+ + arrow_cast('bar', 'LargeUtf8') as c; + ++----+-----+-----+ +| a | b | c | ++----+-----+-----+ +| -5 | foo | bar | ++----+-----+-----+ + +> select + arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs, "+08:00")') as d, + arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs)') as e; + ++---------------------------+---------------------+ +| d | e | 
++---------------------------+---------------------+ +| 2023-01-02T12:53:02+08:00 | 2023-01-02T12:53:02 | ++---------------------------+---------------------+ ```"#, argument( name = "expression", diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs index 74e286de0f58..c4e89743bd55 100644 --- a/datafusion/functions/src/datetime/date_bin.rs +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -687,7 +687,7 @@ mod tests { let res = invoke_date_bin_with_args(args, 1, return_field); assert_eq!( res.err().unwrap().strip_backtrace(), - "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision but got Timestamp(Microsecond, None)" + "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision but got Timestamp(µs)" ); args = vec![ diff --git a/datafusion/functions/src/datetime/to_local_time.rs b/datafusion/functions/src/datetime/to_local_time.rs index a2a54398a33b..6e0a150b0a35 100644 --- a/datafusion/functions/src/datetime/to_local_time.rs +++ b/datafusion/functions/src/datetime/to_local_time.rs @@ -69,11 +69,11 @@ use datafusion_macros::user_doc; FROM ( SELECT '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' AS time ); -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ -| time | type | to_local_time | to_local_time_type | -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ -| 2024-04-01T00:00:20+02:00 | Timestamp(Nanosecond, Some("Europe/Brussels")) | 2024-04-01T00:00:20 | Timestamp(Nanosecond, None) | -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ ++---------------------------+----------------------------------+---------------------+--------------------+ +| time | type | to_local_time | to_local_time_type | ++---------------------------+----------------------------------+---------------------+--------------------+ +| 2024-04-01T00:00:20+02:00 | Timestamp(ns, "Europe/Brussels") | 2024-04-01T00:00:20 | Timestamp(ns) | ++---------------------------+----------------------------------+---------------------+--------------------+ # combine `to_local_time()` with `date_bin()` to bin on boundaries in the timezone rather # than UTC boundaries diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index dcd52aa07be3..0a0700097770 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -38,7 +38,7 @@ use datafusion_macros::user_doc; description = r#" Converts a value to a timestamp (`YYYY-MM-DDT00:00:00Z`). Supports strings, integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats] are provided. Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp. -Note: `to_timestamp` returns `Timestamp(Nanosecond)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. +Note: `to_timestamp` returns `Timestamp(ns)`. 
The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. "#, syntax_example = "to_timestamp(expression[, ..., format_n])", sql_example = r#"```sql diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 5baa91936320..e5314ad220c8 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -24,7 +24,10 @@ use arrow::{ datatypes::{ByteArrayType, DataType}, }; use arrow_buffer::{Buffer, OffsetBufferBuilder}; -use base64::{engine::general_purpose, Engine as _}; +use base64::{ + engine::{DecodePaddingMode, GeneralPurpose, GeneralPurposeConfig}, + Engine as _, +}; use datafusion_common::{ cast::{as_generic_binary_array, as_generic_string_array}, not_impl_err, plan_err, @@ -40,6 +43,14 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use datafusion_macros::user_doc; use std::any::Any; +// Allow padding characters, but don't require them, and don't generate them. +const BASE64_ENGINE: GeneralPurpose = GeneralPurpose::new( + &base64::alphabet::STANDARD, + GeneralPurposeConfig::new() + .with_encode_padding(false) + .with_decode_padding_mode(DecodePaddingMode::Indifferent), +); + #[user_doc( doc_section(label = "Binary String Functions"), description = "Encode binary data into a textual representation.", @@ -302,7 +313,7 @@ fn hex_encode(input: &[u8]) -> String { } fn base64_encode(input: &[u8]) -> String { - general_purpose::STANDARD_NO_PAD.encode(input) + BASE64_ENGINE.encode(input) } fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result { @@ -315,7 +326,7 @@ fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result { } fn base64_decode(input: &[u8], buf: &mut [u8]) -> Result { - general_purpose::STANDARD_NO_PAD + BASE64_ENGINE .decode_slice(input, buf) .map_err(|e| internal_datafusion_err!("Failed to decode from base64: {e}")) } @@ -364,18 +375,16 @@ where impl Encoding { fn encode_scalar(self, value: Option<&[u8]>) -> ColumnarValue { ColumnarValue::Scalar(match self { - Self::Base64 => ScalarValue::Utf8( - value.map(|v| general_purpose::STANDARD_NO_PAD.encode(v)), - ), + Self::Base64 => ScalarValue::Utf8(value.map(|v| BASE64_ENGINE.encode(v))), Self::Hex => ScalarValue::Utf8(value.map(hex::encode)), }) } fn encode_large_scalar(self, value: Option<&[u8]>) -> ColumnarValue { ColumnarValue::Scalar(match self { - Self::Base64 => ScalarValue::LargeUtf8( - value.map(|v| general_purpose::STANDARD_NO_PAD.encode(v)), - ), + Self::Base64 => { + ScalarValue::LargeUtf8(value.map(|v| BASE64_ENGINE.encode(v))) + } Self::Hex => ScalarValue::LargeUtf8(value.map(hex::encode)), }) } @@ -411,15 +420,9 @@ impl Encoding { }; let out = match self { - Self::Base64 => { - general_purpose::STANDARD_NO_PAD - .decode(value) - .map_err(|e| { - internal_datafusion_err!( - "Failed to decode value using base64: {e}" - ) - })? - } + Self::Base64 => BASE64_ENGINE.decode(value).map_err(|e| { + internal_datafusion_err!("Failed to decode value using base64: {e}") + })?, Self::Hex => hex::decode(value).map_err(|e| { internal_datafusion_err!("Failed to decode value using hex: {e}") })?, @@ -435,15 +438,9 @@ impl Encoding { }; let out = match self { - Self::Base64 => { - general_purpose::STANDARD_NO_PAD - .decode(value) - .map_err(|e| { - internal_datafusion_err!( - "Failed to decode value using base64: {e}" - ) - })? 
- } + Self::Base64 => BASE64_ENGINE.decode(value).map_err(|e| { + internal_datafusion_err!("Failed to decode value using base64: {e}") + })?, Self::Hex => hex::decode(value).map_err(|e| { internal_datafusion_err!("Failed to decode value using hex: {e}") })?, diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs index 040f13c01449..b3dc2b2eb6f8 100644 --- a/datafusion/functions/src/math/abs.rs +++ b/datafusion/functions/src/math/abs.rs @@ -22,7 +22,8 @@ use std::sync::Arc; use arrow::array::{ ArrayRef, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, + Int8Array, }; use arrow::datatypes::DataType; use arrow::error::ArrowError; @@ -34,6 +35,7 @@ use datafusion_expr::{ Volatility, }; use datafusion_macros::user_doc; +use num_traits::sign::Signed; type MathArrayFunction = fn(&ArrayRef) -> Result; @@ -81,6 +83,7 @@ macro_rules! make_decimal_abs_function { /// Return different implementations based on input datatype to reduce branches during execution fn create_abs_function(input_data_type: &DataType) -> Result { match input_data_type { + DataType::Float16 => Ok(make_abs_function!(Float16Array)), DataType::Float32 => Ok(make_abs_function!(Float32Array)), DataType::Float64 => Ok(make_abs_function!(Float64Array)), @@ -143,6 +146,7 @@ impl ScalarUDFImpl for AbsFunc { fn as_any(&self) -> &dyn Any { self } + fn name(&self) -> &str { "abs" } @@ -152,35 +156,7 @@ impl ScalarUDFImpl for AbsFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - match arg_types[0] { - DataType::Float32 => Ok(DataType::Float32), - DataType::Float64 => Ok(DataType::Float64), - DataType::Int8 => Ok(DataType::Int8), - DataType::Int16 => Ok(DataType::Int16), - DataType::Int32 => Ok(DataType::Int32), - DataType::Int64 => Ok(DataType::Int64), - DataType::Null => Ok(DataType::Null), - DataType::UInt8 => Ok(DataType::UInt8), - DataType::UInt16 => Ok(DataType::UInt16), - DataType::UInt32 => Ok(DataType::UInt32), - DataType::UInt64 => Ok(DataType::UInt64), - DataType::Decimal32(precision, scale) => { - Ok(DataType::Decimal32(precision, scale)) - } - DataType::Decimal64(precision, scale) => { - Ok(DataType::Decimal64(precision, scale)) - } - DataType::Decimal128(precision, scale) => { - Ok(DataType::Decimal128(precision, scale)) - } - DataType::Decimal256(precision, scale) => { - Ok(DataType::Decimal256(precision, scale)) - } - _ => not_impl_err!( - "Unsupported data type {} for function abs", - arg_types[0].to_string() - ), - } + Ok(arg_types[0].clone()) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 0b35f664532d..46b3cc63d0b6 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -71,7 +71,13 @@ impl Default for SubstrFunc { impl SubstrFunc { pub fn new() -> Self { Self { - signature: Signature::user_defined(Volatility::Immutable), + signature: Signature::user_defined(Volatility::Immutable) + .with_parameter_names(vec![ + "str".to_string(), + "start_pos".to_string(), + "length".to_string(), + ]) + .expect("valid parameter names"), aliases: vec![String::from("substring")], } } diff --git a/datafusion/macros/Cargo.toml b/datafusion/macros/Cargo.toml index fe979720bc56..64781ddeaf42 100644 --- a/datafusion/macros/Cargo.toml +++ 
b/datafusion/macros/Cargo.toml @@ -43,4 +43,4 @@ proc-macro = true [dependencies] datafusion-doc = { workspace = true } quote = "1.0.41" -syn = { version = "2.0.106", features = ["full"] } +syn = { version = "2.0.108", features = ["full"] } diff --git a/datafusion/macros/src/user_doc.rs b/datafusion/macros/src/user_doc.rs index 71ce381ec431..58c2cc2b1b2a 100644 --- a/datafusion/macros/src/user_doc.rs +++ b/datafusion/macros/src/user_doc.rs @@ -61,7 +61,6 @@ use syn::{parse_macro_input, DeriveInput, LitStr}; /// } /// ``` /// will generate the following code -/// /// ```ignore /// pub struct ToDateFunc { /// signature: Signature, diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 3d5dee3a7255..4fb0f8553b4b 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -2117,7 +2117,7 @@ mod test { assert_analyzed_plan_eq!( plan, @r#" - Projection: CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None)) = CAST(CAST(Utf8("1998-03-18") AS Date32) AS Timestamp(Nanosecond, None)) + Projection: CAST(Utf8("1998-03-18") AS Timestamp(ns)) = CAST(CAST(Utf8("1998-03-18") AS Date32) AS Timestamp(ns)) EmptyRelation: rows=0 "# ) @@ -2258,7 +2258,7 @@ mod test { let err = coerce_case_expression(case, &schema).unwrap_err(); assert_snapshot!( err.strip_backtrace(), - @"Error during planning: Failed to coerce then (Date32, Float32, Binary) and else (Timestamp(Nanosecond, None)) to common types in CASE WHEN expression" + @"Error during planning: Failed to coerce then (Date32, Float32, Binary) and else (Timestamp(ns)) to common types in CASE WHEN expression" ); Ok(()) @@ -2465,7 +2465,7 @@ mod test { assert_analyzed_plan_eq!( plan, @r#" - Projection: a = CAST(CAST(a AS Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false)) AS Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false)) + Projection: a = CAST(CAST(a AS Map("key_value": Struct("key": Utf8, "value": nullable Float64), unsorted)) AS Map("entries": Struct("key": Utf8, "value": nullable Float64), unsorted)) EmptyRelation: rows=0 "# ) @@ -2488,7 +2488,7 @@ mod test { assert_analyzed_plan_eq!( plan, @r#" - Projection: IntervalYearMonth("12") + CAST(Utf8("2000-01-01T00:00:00") AS Timestamp(Nanosecond, None)) + Projection: IntervalYearMonth("12") + CAST(Utf8("2000-01-01T00:00:00") AS Timestamp(ns)) EmptyRelation: rows=0 "# ) @@ -2513,7 +2513,7 @@ mod test { assert_analyzed_plan_eq!( plan, @r#" - Projection: CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None)) - CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None)) + Projection: CAST(Utf8("1998-03-18") AS Timestamp(ns)) - CAST(Utf8("1998-03-18") AS Timestamp(ns)) EmptyRelation: rows=0 "# ) diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index c8be689fc5a4..ccf90f91e68f 100644 --- 
a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -1972,14 +1972,14 @@ mod tests { assert_optimized_plan_equal!( plan, - @r#" + @r" Projection: test.b [b:UInt32] LeftSemi Join: Filter: Boolean(true) [a:UInt32, b:UInt32, c:UInt32] TableScan: test [a:UInt32, b:UInt32, c:UInt32] SubqueryAlias: __correlated_sq_1 [arr:Int32;N] Unnest: lists[sq.arr|depth=1] structs[] [arr:Int32;N] - TableScan: sq [arr:List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - "# + TableScan: sq [arr:List(Field { data_type: Int32, nullable: true });N] + " ) } @@ -2007,14 +2007,14 @@ mod tests { assert_optimized_plan_equal!( plan, - @r#" + @r" Projection: test.b [b:UInt32] LeftSemi Join: Filter: __correlated_sq_1.a = test.b [a:UInt32, b:UInt32, c:UInt32] TableScan: test [a:UInt32, b:UInt32, c:UInt32] SubqueryAlias: __correlated_sq_1 [a:UInt32;N] Unnest: lists[sq.a|depth=1] structs[] [a:UInt32;N] - TableScan: sq [a:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - "# + TableScan: sq [a:List(Field { data_type: UInt32, nullable: true });N] + " ) } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index a8251d669002..1c0790b3e3ac 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -562,7 +562,6 @@ fn push_down_join( /// /// * `on_filters` filters from the join ON clause that have not already been /// identified as join predicates -/// fn infer_join_predicates( join: &Join, predicates: &[Expr], @@ -649,7 +648,6 @@ impl InferredPredicates { /// * `predicates` the pushed down predicates /// /// * `inferred_predicates` the inferred results -/// fn infer_join_predicates_from_predicates( join_col_keys: &[(&Column, &Column)], predicates: &[Expr], @@ -673,7 +671,6 @@ fn infer_join_predicates_from_predicates( /// identified as join predicates /// /// * `inferred_predicates` the inferred results -/// fn infer_join_predicates_from_on_filters( join_col_keys: &[(&Column, &Column)], join_type: JoinType, @@ -719,7 +716,6 @@ fn infer_join_predicates_from_on_filters( /// /// * `ENABLE_RIGHT_TO_LEFT` indicates that the left table related predicate can /// be inferred from the right table related predicate -/// fn infer_join_predicates_impl< const ENABLE_LEFT_TO_RIGHT: bool, const ENABLE_RIGHT_TO_LEFT: bool, diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index c5a2e6578805..80d4a2de6679 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -30,7 +30,6 @@ use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; use datafusion_expr::{lit, FetchType, SkipType}; /// Optimization rule that tries to push down `LIMIT`. -/// //. 
It will push down through projection, limits (taking the smaller limit) #[derive(Default, Debug)] pub struct PushDownLimit {} diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index c40906239073..85e9d9b6a0ed 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -29,6 +29,7 @@ use std::sync::Arc; use datafusion_common::{ cast::{as_large_list_array, as_list_array}, + metadata::FieldMetadata, tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter}, }; use datafusion_common::{ @@ -57,7 +58,6 @@ use crate::simplify_expressions::unwrap_cast::{ unwrap_cast_in_comparison_for_binary, }; use crate::simplify_expressions::SimplifyInfo; -use datafusion_expr::expr::FieldMetadata; use datafusion_expr_common::casts::try_cast_literal_to_type; use indexmap::IndexSet; use regex::Regex; @@ -69,23 +69,21 @@ use regex::Regex; /// /// For example: /// ``` -/// use arrow::datatypes::{Schema, Field, DataType}; -/// use datafusion_expr::{col, lit}; +/// use arrow::datatypes::{DataType, Field, Schema}; /// use datafusion_common::{DataFusionError, ToDFSchema}; /// use datafusion_expr::execution_props::ExecutionProps; /// use datafusion_expr::simplify::SimplifyContext; +/// use datafusion_expr::{col, lit}; /// use datafusion_optimizer::simplify_expressions::ExprSimplifier; /// /// // Create the schema -/// let schema = Schema::new(vec![ -/// Field::new("i", DataType::Int64, false), -/// ]) -/// .to_dfschema_ref().unwrap(); +/// let schema = Schema::new(vec![Field::new("i", DataType::Int64, false)]) +/// .to_dfschema_ref() +/// .unwrap(); /// /// // Create the simplifier /// let props = ExecutionProps::new(); -/// let context = SimplifyContext::new(&props) -/// .with_schema(schema); +/// let context = SimplifyContext::new(&props).with_schema(schema); /// let simplifier = ExprSimplifier::new(context); /// /// // Use the simplifier @@ -144,35 +142,35 @@ impl ExprSimplifier { /// /// ``` /// use arrow::datatypes::DataType; - /// use datafusion_expr::{col, lit, Expr}; + /// use datafusion_common::DFSchema; /// use datafusion_common::Result; /// use datafusion_expr::execution_props::ExecutionProps; /// use datafusion_expr::simplify::SimplifyContext; /// use datafusion_expr::simplify::SimplifyInfo; + /// use datafusion_expr::{col, lit, Expr}; /// use datafusion_optimizer::simplify_expressions::ExprSimplifier; - /// use datafusion_common::DFSchema; /// use std::sync::Arc; /// /// /// Simple implementation that provides `Simplifier` the information it needs /// /// See SimplifyContext for a structure that does this. 
/// #[derive(Default)] /// struct Info { - /// execution_props: ExecutionProps, + /// execution_props: ExecutionProps, /// }; /// /// impl SimplifyInfo for Info { - /// fn is_boolean_type(&self, expr: &Expr) -> Result { - /// Ok(false) - /// } - /// fn nullable(&self, expr: &Expr) -> Result { - /// Ok(true) - /// } - /// fn execution_props(&self) -> &ExecutionProps { - /// &self.execution_props - /// } - /// fn get_data_type(&self, expr: &Expr) -> Result { - /// Ok(DataType::Int32) - /// } + /// fn is_boolean_type(&self, expr: &Expr) -> Result { + /// Ok(false) + /// } + /// fn nullable(&self, expr: &Expr) -> Result { + /// Ok(true) + /// } + /// fn execution_props(&self) -> &ExecutionProps { + /// &self.execution_props + /// } + /// fn get_data_type(&self, expr: &Expr) -> Result { + /// Ok(DataType::Int32) + /// } /// } /// /// // Create the simplifier @@ -198,7 +196,6 @@ impl ExprSimplifier { /// optimizations. /// /// See [Self::simplify] for details and usage examples. - /// #[deprecated( since = "48.0.0", note = "Use `simplify_with_cycle_count_transformed` instead" @@ -222,7 +219,6 @@ impl ExprSimplifier { /// - The number of simplification cycles that were performed /// /// See [Self::simplify] for details and usage examples. - /// pub fn simplify_with_cycle_count_transformed( &self, mut expr: Expr, @@ -286,24 +282,24 @@ impl ExprSimplifier { /// /// ```rust /// use arrow::datatypes::{DataType, Field, Schema}; - /// use datafusion_expr::{col, lit, Expr}; - /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; /// use datafusion_common::{Result, ScalarValue, ToDFSchema}; /// use datafusion_expr::execution_props::ExecutionProps; + /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; /// use datafusion_expr::simplify::SimplifyContext; + /// use datafusion_expr::{col, lit, Expr}; /// use datafusion_optimizer::simplify_expressions::ExprSimplifier; /// /// let schema = Schema::new(vec![ - /// Field::new("x", DataType::Int64, false), - /// Field::new("y", DataType::UInt32, false), - /// Field::new("z", DataType::Int64, false), - /// ]) - /// .to_dfschema_ref().unwrap(); + /// Field::new("x", DataType::Int64, false), + /// Field::new("y", DataType::UInt32, false), + /// Field::new("z", DataType::Int64, false), + /// ]) + /// .to_dfschema_ref() + /// .unwrap(); /// /// // Create the simplifier /// let props = ExecutionProps::new(); - /// let context = SimplifyContext::new(&props) - /// .with_schema(schema); + /// let context = SimplifyContext::new(&props).with_schema(schema); /// /// // Expression: (x >= 3) AND (y + 2 < 10) AND (z > 5) /// let expr_x = col("x").gt_eq(lit(3_i64)); @@ -312,15 +308,18 @@ impl ExprSimplifier { /// let expr = expr_x.and(expr_y).and(expr_z.clone()); /// /// let guarantees = vec![ - /// // x ∈ [3, 5] - /// ( - /// col("x"), - /// NullableInterval::NotNull { - /// values: Interval::make(Some(3_i64), Some(5_i64)).unwrap() - /// } - /// ), - /// // y = 3 - /// (col("y"), NullableInterval::from(ScalarValue::UInt32(Some(3)))), + /// // x ∈ [3, 5] + /// ( + /// col("x"), + /// NullableInterval::NotNull { + /// values: Interval::make(Some(3_i64), Some(5_i64)).unwrap(), + /// }, + /// ), + /// // y = 3 + /// ( + /// col("y"), + /// NullableInterval::from(ScalarValue::UInt32(Some(3))), + /// ), /// ]; /// let simplifier = ExprSimplifier::new(context).with_guarantees(guarantees); /// let output = simplifier.simplify(expr).unwrap(); @@ -345,24 +344,24 @@ impl ExprSimplifier { /// /// ```rust /// use arrow::datatypes::{DataType, 
Field, Schema}; - /// use datafusion_expr::{col, lit, Expr}; - /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; /// use datafusion_common::{Result, ScalarValue, ToDFSchema}; /// use datafusion_expr::execution_props::ExecutionProps; + /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; /// use datafusion_expr::simplify::SimplifyContext; + /// use datafusion_expr::{col, lit, Expr}; /// use datafusion_optimizer::simplify_expressions::ExprSimplifier; /// /// let schema = Schema::new(vec![ - /// Field::new("a", DataType::Int64, false), - /// Field::new("b", DataType::Int64, false), - /// Field::new("c", DataType::Int64, false), - /// ]) - /// .to_dfschema_ref().unwrap(); + /// Field::new("a", DataType::Int64, false), + /// Field::new("b", DataType::Int64, false), + /// Field::new("c", DataType::Int64, false), + /// ]) + /// .to_dfschema_ref() + /// .unwrap(); /// /// // Create the simplifier /// let props = ExecutionProps::new(); - /// let context = SimplifyContext::new(&props) - /// .with_schema(schema); + /// let context = SimplifyContext::new(&props).with_schema(schema); /// let simplifier = ExprSimplifier::new(context); /// /// // Expression: a = c AND 1 = b @@ -376,9 +375,9 @@ impl ExprSimplifier { /// /// // If canonicalization is disabled, the expression is not changed /// let non_canonicalized = simplifier - /// .with_canonicalize(false) - /// .simplify(expr.clone()) - /// .unwrap(); + /// .with_canonicalize(false) + /// .simplify(expr.clone()) + /// .unwrap(); /// /// assert_eq!(non_canonicalized, expr); /// ``` @@ -437,7 +436,6 @@ impl ExprSimplifier { /// assert_eq!(simplified_expr.data, lit(true)); /// // Only 1 cycle was executed /// assert_eq!(count, 1); - /// /// ``` pub fn with_max_cycles(mut self, max_simplifier_cycles: u32) -> Self { self.max_simplifier_cycles = max_simplifier_cycles; diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs index 5286cbd7bdf6..b1f3b006e0cf 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs @@ -53,7 +53,6 @@ //! ```text //! c1 > INT32(10) //! ``` -//! use arrow::datatypes::DataType; use datafusion_common::{internal_err, tree_node::Transformed}; diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs index 36ecd1c81619..6c96975ed644 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs @@ -141,6 +141,12 @@ create_left_integral_dyn_scalar_kernel!( bitwise_shift_left_scalar ); +/// Concatenates two `StringViewArray`s element-wise. +/// If either element is `Null`, the result element is also `Null`. +/// +/// # Errors +/// - Returns an error if the input arrays have different lengths. +/// - Returns an error if any concatenated string exceeds `u32::MAX` (≈4 GB) in length. 
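The new doc comment above is straightforward to exercise. The sketch below is a simplified element-wise concat over `StringViewArray`, assuming an arrow-rs version that exposes `StringViewBuilder::try_append_value` (the same method the patched kernel now calls); it returns `ArrowError` instead of DataFusion's error type, skips the scalar fast paths, and uses a small hypothetical `make` helper to build test inputs.

```rust
use arrow::array::{Array, StringViewArray, StringViewBuilder};
use arrow::error::ArrowError;

/// Test helper: build a StringViewArray from optional string slices.
fn make(values: &[Option<&str>]) -> StringViewArray {
    let mut b = StringViewBuilder::new();
    for v in values {
        match v {
            Some(s) => b.append_value(s),
            None => b.append_null(),
        }
    }
    b.finish()
}

/// Element-wise concatenation with null propagation; oversized results
/// surface as an error via `try_append_value` instead of a panic.
fn concat_views(
    left: &StringViewArray,
    right: &StringViewArray,
) -> Result<StringViewArray, ArrowError> {
    if left.len() != right.len() {
        return Err(ArrowError::ComputeError("length mismatch".to_string()));
    }
    let mut builder = StringViewBuilder::new();
    for i in 0..left.len() {
        if left.is_null(i) || right.is_null(i) {
            // a null on either side produces a null output element
            builder.append_null();
        } else {
            builder.try_append_value(format!("{}{}", left.value(i), right.value(i)))?;
        }
    }
    Ok(builder.finish())
}

fn main() -> Result<(), ArrowError> {
    let l = make(&[Some("foo"), None, Some("a")]);
    let r = make(&[Some("bar"), Some("x"), None]);
    let out = concat_views(&l, &r)?;
    assert_eq!(out.value(0), "foobar");
    assert!(out.is_null(1) && out.is_null(2));
    Ok(())
}
```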
pub fn concat_elements_utf8view( left: &StringViewArray, right: &StringViewArray, @@ -166,7 +172,7 @@ pub fn concat_elements_utf8view( buffer.clear(); write!(&mut buffer, "{left}{right}") .expect("writing into string buffer failed"); - result.append_value(&buffer); + result.try_append_value(&buffer)?; } else { // at least one of the values is null, so the output is also null result.append_null() @@ -260,13 +266,13 @@ pub(crate) fn regex_match_dyn_scalar( let result: Result = match left.data_type() { DataType::Utf8 => { regexp_is_match_flag_scalar!(left, right, StringArray, not_match, flag) - }, + } DataType::Utf8View => { regexp_is_match_flag_scalar!(left, right, StringViewArray, not_match, flag) } DataType::LargeUtf8 => { regexp_is_match_flag_scalar!(left, right, LargeStringArray, not_match, flag) - }, + } DataType::Dictionary(_, _) => { let values = left.as_any_dictionary().values(); @@ -288,7 +294,7 @@ pub(crate) fn regex_match_dyn_scalar( _ => unreachable!(), } ) - }, + } other => internal_err!( "Data type {} not supported for operation 'regex_match_dyn_scalar' on string array", other diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 2db599047bcd..0b4c3af1d9c5 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -15,25 +15,28 @@ // specific language governing permissions and limitations // under the License. +use super::{Column, Literal}; +use crate::expressions::case::ResultState::{Complete, Empty, Partial}; use crate::expressions::try_cast; use crate::PhysicalExpr; -use std::borrow::Cow; -use std::hash::Hash; -use std::{any::Any, sync::Arc}; - use arrow::array::*; use arrow::compute::kernels::zip::zip; -use arrow::compute::{and, and_not, is_null, not, nullif, or, prep_null_mask_filter}; -use arrow::datatypes::{DataType, Schema}; +use arrow::compute::{ + is_not_null, not, nullif, prep_null_mask_filter, FilterBuilder, FilterPredicate, +}; +use arrow::datatypes::{DataType, Schema, UInt32Type}; +use arrow::error::ArrowError; use datafusion_common::cast::as_boolean_array; use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, }; use datafusion_expr::ColumnarValue; - -use super::{Column, Literal}; use datafusion_physical_expr_common::datum::compare_with_eq; use itertools::Itertools; +use std::borrow::Cow; +use std::fmt::{Debug, Formatter}; +use std::hash::Hash; +use std::{any::Any, sync::Arc}; type WhenThen = (Arc, Arc); @@ -98,7 +101,7 @@ pub struct CaseExpr { } impl std::fmt::Display for CaseExpr { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { write!(f, "CASE ")?; if let Some(e) = &self.expr { write!(f, "{e} ")?; @@ -122,6 +125,419 @@ fn is_cheap_and_infallible(expr: &Arc) -> bool { expr.as_any().is::() } +/// Creates a [FilterPredicate] from a boolean array. +fn create_filter(predicate: &BooleanArray) -> FilterPredicate { + let mut filter_builder = FilterBuilder::new(predicate); + // Always optimize the filter since we use them multiple times. + filter_builder = filter_builder.optimize(); + filter_builder.build() +} + +// This should be removed when https://github.com/apache/arrow-rs/pull/8693 +// is merged and becomes available. 
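Because the same `FilterPredicate` is applied to every column of a batch (and to the row-index array), building it once through `FilterBuilder::optimize` amortizes the setup cost. A small sketch of that reuse pattern with plain arrow arrays; the variable names are made up:

```rust
use arrow::array::{BooleanArray, Int32Array, StringArray};
use arrow::compute::FilterBuilder;

fn main() {
    // Keep rows 0 and 2, drop row 1.
    let predicate = BooleanArray::from(vec![true, false, true]);
    // Build (and optimize) the filter once...
    let filter = FilterBuilder::new(&predicate).optimize().build();

    // ...then apply it to as many arrays as needed.
    let ints = Int32Array::from(vec![1, 2, 3]);
    let names = StringArray::from(vec!["a", "b", "c"]);
    let filtered_ints = filter.filter(&ints).unwrap(); // [1, 3]
    let filtered_names = filter.filter(&names).unwrap(); // ["a", "c"]

    // `count()` is the number of selected rows, as used by the patch below.
    assert_eq!(filtered_ints.len(), filter.count());
    assert_eq!(filtered_names.len(), 2);
}
```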
+fn filter_record_batch(
+    record_batch: &RecordBatch,
+    filter: &FilterPredicate,
+) -> std::result::Result<RecordBatch, ArrowError> {
+    let filtered_columns = record_batch
+        .columns()
+        .iter()
+        .map(|a| filter_array(a, filter))
+        .collect::<std::result::Result<Vec<_>, _>>()?;
+    // SAFETY: since we start from a valid RecordBatch, there's no need to revalidate the schema
+    // since the set of columns has not changed.
+    // The input column arrays all had the same length (since they're coming from a valid RecordBatch)
+    // and filtering them with the same filter will produce a new set of arrays with identical
+    // lengths.
+    unsafe {
+        Ok(RecordBatch::new_unchecked(
+            record_batch.schema(),
+            filtered_columns,
+            filter.count(),
+        ))
+    }
+}
+
+// This function exists purely to be able to use the same call style
+// for `filter_record_batch` and `filter_array` at the point of use.
+// When https://github.com/apache/arrow-rs/pull/8693 is available, replace
+// both with method calls on `FilterPredicate`.
+#[inline(always)]
+fn filter_array(
+    array: &dyn Array,
+    filter: &FilterPredicate,
+) -> std::result::Result<ArrayRef, ArrowError> {
+    filter.filter(array)
+}
+
+/// Merges elements by index from a list of [`ArrayData`], creating a new [`ArrayRef`] from
+/// those values.
+///
+/// Each element in `indices` is the index of an array in `values`. The `indices` array is processed
+/// sequentially. The first occurrence of index value `n` will be mapped to the first
+/// value of the array at index `n`. The second occurrence to the second value, and so on.
+/// An index value where `PartialResultIndex::is_none` is `true` is used to indicate null values.
+///
+/// # Implementation notes
+///
+/// This algorithm is similar in nature to both `zip` and `interleave`, but there are some important
+/// differences.
+///
+/// In contrast to `zip`, this function supports multiple input arrays. Instead of a boolean
+/// selection vector, an index array is used to take values from the input arrays, and a special
+/// marker value is used to indicate null values.
+///
+/// In contrast to `interleave`, this function does not use pairs of indices. The values in
+/// `indices` serve the same purpose as the first value in the pairs passed to `interleave`.
+/// The index in the array is implicit and is derived from the number of times a particular array
+/// index occurs.
+/// The more constrained indexing mechanism used by this algorithm makes it easier to copy values
+/// in contiguous slices. In the example below, the two subsequent elements from array `2` can be
+/// copied in a single operation from the source array instead of copying them one by one.
+/// Long spans of null values are also especially cheap because they do not need to be represented
+/// in an input array.
+///
+/// # Safety
+///
+/// This function does not check that the number of occurrences of any particular array index matches
+/// the length of the corresponding input array. If an array contains more values than required, the
+/// spurious values will be ignored. If an array contains fewer values than necessary, this function
+/// will panic.
+/// +/// # Example +/// +/// ```text +/// ┌───────────┐ ┌─────────┐ ┌─────────┐ +/// │┌─────────┐│ │ None │ │ NULL │ +/// ││ A ││ ├─────────┤ ├─────────┤ +/// │└─────────┘│ │ 1 │ │ B │ +/// │┌─────────┐│ ├─────────┤ ├─────────┤ +/// ││ B ││ │ 0 │ merge(values, indices) │ A │ +/// │└─────────┘│ ├─────────┤ ─────────────────────────▶ ├─────────┤ +/// │┌─────────┐│ │ None │ │ NULL │ +/// ││ C ││ ├─────────┤ ├─────────┤ +/// │├─────────┤│ │ 2 │ │ C │ +/// ││ D ││ ├─────────┤ ├─────────┤ +/// │└─────────┘│ │ 2 │ │ D │ +/// └───────────┘ └─────────┘ └─────────┘ +/// values indices result +/// +/// ``` +fn merge(values: &[ArrayData], indices: &[PartialResultIndex]) -> Result { + #[cfg(debug_assertions)] + for ix in indices { + if let Some(index) = ix.index() { + assert!( + index < values.len(), + "Index out of bounds: {} >= {}", + index, + values.len() + ); + } + } + + let data_refs = values.iter().collect(); + let mut mutable = MutableArrayData::new(data_refs, true, indices.len()); + + // This loop extends the mutable array by taking slices from the partial results. + // + // take_offsets keeps track of how many values have been taken from each array. + let mut take_offsets = vec![0; values.len() + 1]; + let mut start_row_ix = 0; + loop { + let array_ix = indices[start_row_ix]; + + // Determine the length of the slice to take. + let mut end_row_ix = start_row_ix + 1; + while end_row_ix < indices.len() && indices[end_row_ix] == array_ix { + end_row_ix += 1; + } + let slice_length = end_row_ix - start_row_ix; + + // Extend mutable with either nulls or with values from the array. + match array_ix.index() { + None => mutable.extend_nulls(slice_length), + Some(index) => { + let start_offset = take_offsets[index]; + let end_offset = start_offset + slice_length; + mutable.extend(index, start_offset, end_offset); + take_offsets[index] = end_offset; + } + } + + if end_row_ix == indices.len() { + break; + } else { + // Set the start_row_ix for the next slice. + start_row_ix = end_row_ix; + } + } + + Ok(make_array(mutable.freeze())) +} + +/// An index into the partial results array that's more compact than `usize`. +/// +/// `u32::MAX` is reserved as a special 'none' value. This is used instead of +/// `Option` to keep the array of indices as compact as possible. +#[derive(Copy, Clone, PartialEq, Eq)] +struct PartialResultIndex { + index: u32, +} + +const NONE_VALUE: u32 = u32::MAX; + +impl PartialResultIndex { + /// Returns the 'none' placeholder value. + fn none() -> Self { + Self { index: NONE_VALUE } + } + + fn zero() -> Self { + Self { index: 0 } + } + + /// Creates a new partial result index. + /// + /// If the provided value is greater than or equal to `u32::MAX` + /// an error will be returned. + fn try_new(index: usize) -> Result { + let Ok(index) = u32::try_from(index) else { + return internal_err!("Partial result index exceeds limit"); + }; + + if index == NONE_VALUE { + return internal_err!("Partial result index exceeds limit"); + } + + Ok(Self { index }) + } + + /// Determines if this index is the 'none' placeholder value or not. + fn is_none(&self) -> bool { + self.index == NONE_VALUE + } + + /// Returns `Some(index)` if this value is not the 'none' placeholder, `None` otherwise. 
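The contiguous-slice copying described above can be illustrated without arrow at all. The sketch below is a hypothetical helper over plain vectors: it groups equal indices into runs and takes each run from its source in one slice, which is the same idea `merge` implements with `MutableArrayData`.

```rust
/// Hypothetical stand-in for `merge`: `None` plays the role of the null marker,
/// and each maximal run of identical source indices is copied as one slice.
fn merge_sketch<T: Clone>(values: &[Vec<T>], indices: &[Option<usize>]) -> Vec<Option<T>> {
    let mut taken = vec![0usize; values.len()]; // rows consumed per source so far
    let mut out = Vec::with_capacity(indices.len());
    let mut start = 0;
    while start < indices.len() {
        // Find the end of the current run of identical indices.
        let mut end = start + 1;
        while end < indices.len() && indices[end] == indices[start] {
            end += 1;
        }
        let len = end - start;
        match indices[start] {
            // A run of nulls needs no input values at all.
            None => out.extend(std::iter::repeat(None).take(len)),
            // A run of `Some(src)` is one contiguous copy from that source.
            Some(src) => {
                let offset = taken[src];
                out.extend(values[src][offset..offset + len].iter().cloned().map(Some));
                taken[src] = offset + len;
            }
        }
        start = end;
    }
    out
}

fn main() {
    let values = vec![vec!["A", "B"], vec!["C", "D"]];
    let indices = vec![Some(0), None, Some(1), Some(1), Some(0)];
    assert_eq!(
        merge_sketch(&values, &indices),
        vec![Some("A"), None, Some("C"), Some("D"), Some("B")]
    );
}
```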
+ fn index(&self) -> Option { + if self.is_none() { + None + } else { + Some(self.index as usize) + } + } +} + +impl Debug for PartialResultIndex { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if self.is_none() { + write!(f, "null") + } else { + write!(f, "{}", self.index) + } + } +} + +enum ResultState { + /// The final result is an array containing only null values. + Empty, + /// The final result needs to be computed by merging the data in `arrays`. + Partial { + // A `Vec` of partial results that should be merged. + // `partial_result_indices` contains indexes into this vec. + arrays: Vec, + // Indicates per result row from which array in `partial_results` a value should be taken. + indices: Vec, + }, + /// A single branch matched all input rows. When creating the final result, no further merging + /// of partial results is necessary. + Complete(ColumnarValue), +} + +/// A builder for constructing result arrays for CASE expressions. +/// +/// Rather than building a monolithic array containing all results, it maintains a set of +/// partial result arrays and a mapping that indicates for each row which partial array +/// contains the result value for that row. +/// +/// On finish(), the builder will merge all partial results into a single array if necessary. +/// If all rows evaluated to the same array, that array can be returned directly without +/// any merging overhead. +struct ResultBuilder { + data_type: DataType, + /// The number of rows in the final result. + row_count: usize, + state: ResultState, +} + +impl ResultBuilder { + /// Creates a new ResultBuilder that will produce arrays of the given data type. + /// + /// The `row_count` parameter indicates the number of rows in the final result. + fn new(data_type: &DataType, row_count: usize) -> Self { + Self { + data_type: data_type.clone(), + row_count, + state: Empty, + } + } + + /// Adds a result for one branch of the case expression. + /// + /// `row_indices` should be a [UInt32Array] containing [RecordBatch] relative row indices + /// for which `value` contains result values. + /// + /// If `value` is a scalar, the scalar value will be used as the value for each row in `row_indices`. + /// + /// If `value` is an array, the values from the array and the indices from `row_indices` will be + /// processed pairwise. The lengths of `value` and `row_indices` must match. + /// + /// The diagram below shows a situation where a when expression matched rows 1 and 4 of the + /// record batch. The then expression produced the value array `[A, D]`. + /// After adding this result, the result array will have been added to `partial arrays` and + /// `partial indices` will have been updated at indexes `1` and `4`. 
+ /// + /// ```text + /// ┌─────────┐ ┌─────────┐┌───────────┐ ┌─────────┐┌───────────┐ + /// │ C │ │ 0: None ││┌ 0 ──────┐│ │ 0: None ││┌ 0 ──────┐│ + /// ├─────────┤ ├─────────┤││ A ││ ├─────────┤││ A ││ + /// │ D │ │ 1: None ││└─────────┘│ │ 1: 2 ││└─────────┘│ + /// └─────────┘ ├─────────┤│┌ 1 ──────┐│ add_branch_result( ├─────────┤│┌ 1 ──────┐│ + /// matching │ 2: 0 │││ B ││ row indices, │ 2: 0 │││ B ││ + /// 'then' values ├─────────┤│└─────────┘│ value ├─────────┤│└─────────┘│ + /// │ 3: None ││ │ ) │ 3: None ││┌ 2 ──────┐│ + /// ┌─────────┐ ├─────────┤│ │ ─────────────────────────▶ ├─────────┤││ C ││ + /// │ 1 │ │ 4: None ││ │ │ 4: 2 ││├─────────┤│ + /// ├─────────┤ ├─────────┤│ │ ├─────────┤││ D ││ + /// │ 4 │ │ 5: 1 ││ │ │ 5: 1 ││└─────────┘│ + /// └─────────┘ └─────────┘└───────────┘ └─────────┘└───────────┘ + /// row indices partial partial partial partial + /// indices arrays indices arrays + /// ``` + fn add_branch_result( + &mut self, + row_indices: &ArrayRef, + value: ColumnarValue, + ) -> Result<()> { + match value { + ColumnarValue::Array(a) => { + if a.len() != row_indices.len() { + internal_err!("Array length must match row indices length") + } else if row_indices.len() == self.row_count { + self.set_complete_result(ColumnarValue::Array(a)) + } else { + self.add_partial_result(row_indices, a.to_data()) + } + } + ColumnarValue::Scalar(s) => { + if row_indices.len() == self.row_count { + self.set_complete_result(ColumnarValue::Scalar(s)) + } else { + self.add_partial_result( + row_indices, + s.to_array_of_size(row_indices.len())?.to_data(), + ) + } + } + } + } + + /// Adds a partial result array. + /// + /// This method adds the given array data as a partial result and updates the index mapping + /// to indicate that the specified rows should take their values from this array. + /// The partial results will be merged into a single array when finish() is called. + fn add_partial_result( + &mut self, + row_indices: &ArrayRef, + row_values: ArrayData, + ) -> Result<()> { + if row_indices.null_count() != 0 { + return internal_err!("Row indices must not contain nulls"); + } + + match &mut self.state { + Empty => { + let array_index = PartialResultIndex::zero(); + let mut indices = vec![PartialResultIndex::none(); self.row_count]; + for row_ix in row_indices.as_primitive::().values().iter() { + indices[*row_ix as usize] = array_index; + } + + self.state = Partial { + arrays: vec![row_values], + indices, + }; + + Ok(()) + } + Partial { arrays, indices } => { + let array_index = PartialResultIndex::try_new(arrays.len())?; + + arrays.push(row_values); + + for row_ix in row_indices.as_primitive::().values().iter() { + // This is check is only active for debug config because the callers of this method, + // `case_when_with_expr` and `case_when_no_expr`, already ensure that + // they only calculate a value for each row at most once. + #[cfg(debug_assertions)] + if !indices[*row_ix as usize].is_none() { + return internal_err!("Duplicate value for row {}", *row_ix); + } + + indices[*row_ix as usize] = array_index; + } + Ok(()) + } + Complete(_) => internal_err!( + "Cannot add a partial result when complete result is already set" + ), + } + } + + /// Sets a result that applies to all rows. + /// + /// This is an optimization for cases where all rows evaluate to the same result. + /// When a complete result is set, the builder will return it directly from finish() + /// without any merging overhead. 
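The rewritten `case_when_with_expr` and `case_when_no_expr` further below drive this builder with a shrinking remainder: each WHEN predicate only sees rows that no earlier branch matched, and matched rows are recorded against their original row indices. A minimal sketch of that control flow over plain vectors; all names here are hypothetical, and filters, scalars and record batches are elided.

```rust
/// Hypothetical model of the rewritten evaluation loop: each branch only sees
/// the rows that are still unmatched, and results are written back through the
/// rows' original indices (the role played by `remainder_rows` below).
fn case_sketch(
    rows: Vec<i64>,
    branches: &[(fn(i64) -> bool, fn(i64) -> i64)], // (WHEN predicate, THEN value)
    else_branch: Option<fn(i64) -> i64>,
) -> Vec<Option<i64>> {
    let mut result = vec![None; rows.len()];
    // The "remainder": (original row index, value) pairs not matched by any branch yet.
    let mut remainder: Vec<(usize, i64)> = rows.into_iter().enumerate().collect();
    for (when, then) in branches {
        if remainder.is_empty() {
            break;
        }
        // Split the remainder into rows this WHEN matches and the new remainder.
        let (matched, rest): (Vec<_>, Vec<_>) =
            remainder.into_iter().partition(|(_, v)| when(*v));
        for (row, v) in matched {
            result[row] = Some(then(v)); // THEN is evaluated only on matched rows
        }
        remainder = rest;
    }
    if let Some(else_fn) = else_branch {
        for (row, v) in remainder {
            result[row] = Some(else_fn(v)); // unmatched rows fall through to ELSE
        }
    }
    result
}

fn main() {
    // CASE WHEN x < 0 THEN -x WHEN x = 0 THEN 0 ELSE x END
    let branches: [(fn(i64) -> bool, fn(i64) -> i64); 2] =
        [(|x| x < 0, |x| -x), (|x| x == 0, |_| 0)];
    let else_fn: fn(i64) -> i64 = |x| x;
    let out = case_sketch(vec![-2, 0, 3], &branches, Some(else_fn));
    assert_eq!(out, vec![Some(2), Some(0), Some(3)]);
}
```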
+ fn set_complete_result(&mut self, value: ColumnarValue) -> Result<()> { + match &self.state { + Empty => { + self.state = Complete(value); + Ok(()) + } + Partial { .. } => { + internal_err!( + "Cannot set a complete result when there are already partial results" + ) + } + Complete(_) => internal_err!("Complete result already set"), + } + } + + /// Finishes building the result and returns the final array. + fn finish(self) -> Result { + match self.state { + Empty => { + // No complete result and no partial results. + // This can happen for case expressions with no else branch where no rows + // matched. + Ok(ColumnarValue::Scalar(ScalarValue::try_new_null( + &self.data_type, + )?)) + } + Partial { arrays, indices } => { + // Merge partial results into a single array. + Ok(ColumnarValue::Array(merge(&arrays, &indices)?)) + } + Complete(v) => { + // If we have a complete result, we can just return it. + Ok(v) + } + } + } +} + impl CaseExpr { /// Create a new CASE WHEN expression pub fn try_new( @@ -196,82 +612,146 @@ impl CaseExpr { /// END fn case_when_with_expr(&self, batch: &RecordBatch) -> Result { let return_type = self.data_type(&batch.schema())?; - let expr = self.expr.as_ref().unwrap(); - let base_value = expr.evaluate(batch)?; - let base_value = base_value.into_array(batch.num_rows())?; - let base_nulls = is_null(base_value.as_ref())?; - - // start with nulls as default output - let mut current_value = new_null_array(&return_type, batch.num_rows()); - // We only consider non-null values while comparing with whens - let mut remainder = not(&base_nulls)?; - let mut non_null_remainder_count = remainder.true_count(); - for i in 0..self.when_then_expr.len() { - // If there are no rows left to process, break out of the loop early - if non_null_remainder_count == 0 { - break; - } + let mut result_builder = ResultBuilder::new(&return_type, batch.num_rows()); + + // `remainder_rows` contains the indices of the rows that need to be evaluated + let mut remainder_rows: ArrayRef = + Arc::new(UInt32Array::from_iter_values(0..batch.num_rows() as u32)); + // `remainder_batch` contains the rows themselves that need to be evaluated + let mut remainder_batch = Cow::Borrowed(batch); + + // evaluate the base expression + let mut base_values = self + .expr + .as_ref() + .unwrap() + .evaluate(batch)? + .into_array(batch.num_rows())?; - let when_predicate = &self.when_then_expr[i].0; - let when_value = when_predicate.evaluate_selection(batch, &remainder)?; - let when_value = when_value.into_array(batch.num_rows())?; - // build boolean array representing which rows match the "when" value - let when_match = compare_with_eq( - &when_value, - &base_value, - // The types of case and when expressions will be coerced to match. - // We only need to check if the base_value is nested. - base_value.data_type().is_nested(), - )?; - // Treat nulls as false - let when_match = match when_match.null_count() { - 0 => Cow::Borrowed(&when_match), - _ => Cow::Owned(prep_null_mask_filter(&when_match)), - }; - // Make sure we only consider rows that have not been matched yet - let when_value = and(&when_match, &remainder)?; + // Fill in a result value already for rows where the base expression value is null + // Since each when expression is tested against the base expression using the equality + // operator, null base values can never match any when expression. `x = NULL` is falsy, + // for all possible values of `x`. 
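+ // For example, `CASE x WHEN NULL THEN 'a' ELSE 'b' END` yields 'b' even when `x` IS NULL,
+ // because the implicit `x = NULL` comparison never evaluates to true.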
+ if base_values.null_count() > 0 { + // Use `is_not_null` since this is a cheap clone of the null buffer from 'base_value'. + // We already checked there are nulls, so we can be sure a new buffer will not be + // created. + let base_not_nulls = is_not_null(base_values.as_ref())?; + let base_all_null = base_values.null_count() == remainder_batch.num_rows(); + + // If there is an else expression, use that as the default value for the null rows + // Otherwise the default `null` value from the result builder will be used. + if let Some(e) = self.else_expr() { + let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; - // If the predicate did not match any rows, continue to the next branch immediately - let when_match_count = when_value.true_count(); - if when_match_count == 0 { - continue; + if base_all_null { + // All base values were null, so no need to filter + let nulls_value = expr.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, nulls_value)?; + } else { + // Filter out the null rows and evaluate the else expression for those + let nulls_filter = create_filter(¬(&base_not_nulls)?); + let nulls_batch = + filter_record_batch(&remainder_batch, &nulls_filter)?; + let nulls_rows = filter_array(&remainder_rows, &nulls_filter)?; + let nulls_value = expr.evaluate(&nulls_batch)?; + result_builder.add_branch_result(&nulls_rows, nulls_value)?; + } } - let then_expression = &self.when_then_expr[i].1; - let then_value = then_expression.evaluate_selection(batch, &when_value)?; + // All base values are null, so we can return early + if base_all_null { + return result_builder.finish(); + } - current_value = match then_value { - ColumnarValue::Scalar(ScalarValue::Null) => { - nullif(current_value.as_ref(), &when_value)? - } - ColumnarValue::Scalar(then_value) => { - zip(&when_value, &then_value.to_scalar()?, ¤t_value)? + // Remove the null rows from the remainder batch + let not_null_filter = create_filter(&base_not_nulls); + remainder_batch = + Cow::Owned(filter_record_batch(&remainder_batch, ¬_null_filter)?); + remainder_rows = filter_array(&remainder_rows, ¬_null_filter)?; + base_values = filter_array(&base_values, ¬_null_filter)?; + } + + // The types of case and when expressions will be coerced to match. + // We only need to check if the base_value is nested. + let base_value_is_nested = base_values.data_type().is_nested(); + + for i in 0..self.when_then_expr.len() { + // Evaluate the 'when' predicate for the remainder batch + // This results in a boolean array with the same length as the remaining number of rows + let when_expr = &self.when_then_expr[i].0; + let when_value = match when_expr.evaluate(&remainder_batch)? { + ColumnarValue::Array(a) => { + compare_with_eq(&a, &base_values, base_value_is_nested) } - ColumnarValue::Array(then_value) => { - zip(&when_value, &then_value, ¤t_value)? + ColumnarValue::Scalar(s) => { + let scalar = Scalar::new(s.to_array()?); + compare_with_eq(&scalar, &base_values, base_value_is_nested) } - }; + }?; - remainder = and_not(&remainder, &when_value)?; - non_null_remainder_count -= when_match_count; - } + // `true_count` ignores `true` values where the validity bit is not set, so there's + // no need to call `prep_null_mask_filter`. 
+ let when_true_count = when_value.true_count(); - if let Some(e) = self.else_expr() { - // null and unmatched tuples should be assigned else value - remainder = or(&base_nulls, &remainder)?; + // If the 'when' predicate did not match any rows, continue to the next branch immediately + if when_true_count == 0 { + continue; + } - if remainder.true_count() > 0 { - // keep `else_expr`'s data type and return type consistent - let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; + // If the 'when' predicate matched all remaining rows, there is no need to filter + if when_true_count == remainder_batch.num_rows() { + let then_expression = &self.when_then_expr[i].1; + let then_value = then_expression.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, then_value)?; + return result_builder.finish(); + } + + // Filter the remainder batch based on the 'when' value + // This results in a batch containing only the rows that need to be evaluated + // for the current branch + // Still no need to call `prep_null_mask_filter` since `create_filter` will already do + // this unconditionally. + let then_filter = create_filter(&when_value); + let then_batch = filter_record_batch(&remainder_batch, &then_filter)?; + let then_rows = filter_array(&remainder_rows, &then_filter)?; - let else_ = expr - .evaluate_selection(batch, &remainder)? - .into_array(batch.num_rows())?; - current_value = zip(&remainder, &else_, ¤t_value)?; + let then_expression = &self.when_then_expr[i].1; + let then_value = then_expression.evaluate(&then_batch)?; + result_builder.add_branch_result(&then_rows, then_value)?; + + // If this is the last 'when' branch and there is no 'else' expression, there's no + // point in calculating the remaining rows. + if self.else_expr.is_none() && i == self.when_then_expr.len() - 1 { + return result_builder.finish(); } + + // Prepare the next when branch (or the else branch) + let next_selection = match when_value.null_count() { + 0 => not(&when_value), + _ => { + // `prep_null_mask_filter` is required to ensure the not operation treats nulls + // as false + not(&prep_null_mask_filter(&when_value)) + } + }?; + let next_filter = create_filter(&next_selection); + remainder_batch = + Cow::Owned(filter_record_batch(&remainder_batch, &next_filter)?); + remainder_rows = filter_array(&remainder_rows, &next_filter)?; + base_values = filter_array(&base_values, &next_filter)?; + } + + // If we reached this point, some rows were left unmatched. + // Check if those need to be evaluated using the 'else' expression. 
+ if let Some(e) = self.else_expr() { + // keep `else_expr`'s data type and return type consistent + let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; + let else_value = expr.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, else_value)?; } - Ok(ColumnarValue::Array(current_value)) + result_builder.finish() } /// This function evaluates the form of CASE where each WHEN expression is a boolean @@ -283,70 +763,86 @@ impl CaseExpr { /// END fn case_when_no_expr(&self, batch: &RecordBatch) -> Result { let return_type = self.data_type(&batch.schema())?; + let mut result_builder = ResultBuilder::new(&return_type, batch.num_rows()); - // start with nulls as default output - let mut current_value = new_null_array(&return_type, batch.num_rows()); - let mut remainder = BooleanArray::from(vec![true; batch.num_rows()]); - let mut remainder_count = batch.num_rows(); - for i in 0..self.when_then_expr.len() { - // If there are no rows left to process, break out of the loop early - if remainder_count == 0 { - break; - } + // `remainder_rows` contains the indices of the rows that need to be evaluated + let mut remainder_rows: ArrayRef = + Arc::new(UInt32Array::from_iter(0..batch.num_rows() as u32)); + // `remainder_batch` contains the rows themselves that need to be evaluated + let mut remainder_batch = Cow::Borrowed(batch); + for i in 0..self.when_then_expr.len() { + // Evaluate the 'when' predicate for the remainder batch + // This results in a boolean array with the same length as the remaining number of rows let when_predicate = &self.when_then_expr[i].0; - let when_value = when_predicate.evaluate_selection(batch, &remainder)?; - let when_value = when_value.into_array(batch.num_rows())?; + let when_value = when_predicate + .evaluate(&remainder_batch)? + .into_array(remainder_batch.num_rows())?; let when_value = as_boolean_array(&when_value).map_err(|_| { internal_datafusion_err!("WHEN expression did not return a BooleanArray") })?; - // Treat 'NULL' as false value - let when_value = match when_value.null_count() { - 0 => Cow::Borrowed(when_value), - _ => Cow::Owned(prep_null_mask_filter(when_value)), - }; - // Make sure we only consider rows that have not been matched yet - let when_value = and(&when_value, &remainder)?; - // If the predicate did not match any rows, continue to the next branch immediately - let when_match_count = when_value.true_count(); - if when_match_count == 0 { + // `true_count` ignores `true` values where the validity bit is not set, so there's + // no need to call `prep_null_mask_filter`. + let when_true_count = when_value.true_count(); + + // If the 'when' predicate did not match any rows, continue to the next branch immediately + if when_true_count == 0 { continue; } + // If the 'when' predicate matched all remaining rows, there is no need to filter + if when_true_count == remainder_batch.num_rows() { + let then_expression = &self.when_then_expr[i].1; + let then_value = then_expression.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, then_value)?; + return result_builder.finish(); + } + + // Filter the remainder batch based on the 'when' value + // This results in a batch containing only the rows that need to be evaluated + // for the current branch + // Still no need to call `prep_null_mask_filter` since `create_filter` will already do + // this unconditionally. 
+ let then_filter = create_filter(when_value); + let then_batch = filter_record_batch(&remainder_batch, &then_filter)?; + let then_rows = filter_array(&remainder_rows, &then_filter)?; + let then_expression = &self.when_then_expr[i].1; - let then_value = then_expression.evaluate_selection(batch, &when_value)?; + let then_value = then_expression.evaluate(&then_batch)?; + result_builder.add_branch_result(&then_rows, then_value)?; - current_value = match then_value { - ColumnarValue::Scalar(ScalarValue::Null) => { - nullif(current_value.as_ref(), &when_value)? - } - ColumnarValue::Scalar(then_value) => { - zip(&when_value, &then_value.to_scalar()?, ¤t_value)? - } - ColumnarValue::Array(then_value) => { - zip(&when_value, &then_value, ¤t_value)? - } - }; + // If this is the last 'when' branch and there is no 'else' expression, there's no + // point in calculating the remaining rows. + if self.else_expr.is_none() && i == self.when_then_expr.len() - 1 { + return result_builder.finish(); + } - // Succeed tuples should be filtered out for short-circuit evaluation, - // null values for the current when expr should be kept - remainder = and_not(&remainder, &when_value)?; - remainder_count -= when_match_count; + // Prepare the next when branch (or the else branch) + let next_selection = match when_value.null_count() { + 0 => not(when_value), + _ => { + // `prep_null_mask_filter` is required to ensure the not operation treats nulls + // as false + not(&prep_null_mask_filter(when_value)) + } + }?; + let next_filter = create_filter(&next_selection); + remainder_batch = + Cow::Owned(filter_record_batch(&remainder_batch, &next_filter)?); + remainder_rows = filter_array(&remainder_rows, &next_filter)?; } + // If we reached this point, some rows were left unmatched. + // Check if those need to be evaluated using the 'else' expression. if let Some(e) = self.else_expr() { - if remainder_count > 0 { - // keep `else_expr`'s data type and return type consistent - let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; - let else_ = expr - .evaluate_selection(batch, &remainder)? - .into_array(batch.num_rows())?; - current_value = zip(&remainder, &else_, ¤t_value)?; - } + // keep `else_expr`'s data type and return type consistent + let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; + let else_value = expr.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, else_value)?; } - Ok(ColumnarValue::Array(current_value)) + result_builder.finish() } /// This function evaluates the specialized case of: @@ -587,7 +1083,7 @@ impl PhysicalExpr for CaseExpr { } } - fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "CASE ")?; if let Some(e) = &self.expr { e.fmt_sql(f)?; diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 407e3e6a9d29..0419161b532c 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -439,8 +439,8 @@ mod tests { let expression = cast_with_options(col("a", &schema)?, &schema, Decimal128(6, 2), None)?; let e = expression.evaluate(&batch).unwrap_err().strip_backtrace(); // panics on OK - assert_snapshot!(e, @"Arrow error: Invalid argument error: 12345679 is too large to store in a Decimal128 of precision 6. 
Max is 999999"); - + assert_snapshot!(e, @"Arrow error: Invalid argument error: 123456.79 is too large to store in a Decimal128 of precision 6. Max is 9999.99"); + // safe cast should return null let expression_safe = cast_with_options( col("a", &schema)?, &schema, diff --git a/datafusion/physical-expr/src/expressions/dynamic_filters.rs b/datafusion/physical-expr/src/expressions/dynamic_filters.rs index a53b32c97689..964a193db833 100644 --- a/datafusion/physical-expr/src/expressions/dynamic_filters.rs +++ b/datafusion/physical-expr/src/expressions/dynamic_filters.rs @@ -381,14 +381,14 @@ mod test { ) .unwrap(); let snap = dynamic_filter_1.snapshot().unwrap().unwrap(); - insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }"#); + insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: false }"#); let dynamic_filter_2 = reassign_expr_columns( Arc::clone(&dynamic_filter) as Arc, &filter_schema_2, ) .unwrap(); let snap = dynamic_filter_2.snapshot().unwrap().unwrap(); - insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 1 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }"#); + insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 1 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: false }"#); // Both filters allow evaluating the same expression let batch_1 = RecordBatch::try_new( Arc::clone(&filter_schema_1), diff --git a/datafusion/physical-expr/src/expressions/literal.rs b/datafusion/physical-expr/src/expressions/literal.rs index 6e425ee439d6..94e91d43a1c4 100644 --- a/datafusion/physical-expr/src/expressions/literal.rs +++ b/datafusion/physical-expr/src/expressions/literal.rs @@ -28,8 +28,8 @@ use arrow::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; +use datafusion_common::metadata::FieldMetadata; use datafusion_common::{Result, ScalarValue}; -use datafusion_expr::expr::FieldMetadata; use datafusion_expr::Expr; use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::interval_arithmetic::Interval; diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 73df60c42e96..7790380dffd5 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -25,13 +25,12 @@ use crate::{ use arrow::datatypes::Schema; use datafusion_common::config::ConfigOptions; +use datafusion_common::metadata::FieldMetadata; use datafusion_common::{ exec_err, not_impl_err, plan_err, DFSchema, Result, ScalarValue, ToDFSchema, }; use datafusion_expr::execution_props::ExecutionProps; -use datafusion_expr::expr::{ - Alias, Cast, FieldMetadata, InList, Placeholder, ScalarFunction, -}; +use datafusion_expr::expr::{Alias, Cast, InList, Placeholder, ScalarFunction}; use datafusion_expr::var_provider::is_system_variables; use datafusion_expr::var_provider::VarType; use datafusion_expr::{ diff --git a/datafusion/physical-expr/src/projection.rs 
b/datafusion/physical-expr/src/projection.rs index e35bfbb3a20d..fc972d644e67 100644 --- a/datafusion/physical-expr/src/projection.rs +++ b/datafusion/physical-expr/src/projection.rs @@ -100,24 +100,24 @@ impl From for (Arc, String) { /// representing a complete projection operation and provides /// methods to manipulate and analyze the projection as a whole. #[derive(Debug, Clone)] -pub struct Projection { +pub struct ProjectionExprs { exprs: Vec, } -impl std::fmt::Display for Projection { +impl std::fmt::Display for ProjectionExprs { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let exprs: Vec = self.exprs.iter().map(|e| e.to_string()).collect(); write!(f, "Projection[{}]", exprs.join(", ")) } } -impl From> for Projection { +impl From> for ProjectionExprs { fn from(value: Vec) -> Self { Self { exprs: value } } } -impl From<&[ProjectionExpr]> for Projection { +impl From<&[ProjectionExpr]> for ProjectionExprs { fn from(value: &[ProjectionExpr]) -> Self { Self { exprs: value.to_vec(), @@ -125,15 +125,83 @@ impl From<&[ProjectionExpr]> for Projection { } } -impl AsRef<[ProjectionExpr]> for Projection { +impl FromIterator for ProjectionExprs { + fn from_iter>(exprs: T) -> Self { + Self { + exprs: exprs.into_iter().collect::>(), + } + } +} + +impl AsRef<[ProjectionExpr]> for ProjectionExprs { fn as_ref(&self) -> &[ProjectionExpr] { &self.exprs } } -impl Projection { - pub fn new(exprs: Vec) -> Self { - Self { exprs } +impl ProjectionExprs { + pub fn new(exprs: I) -> Self + where + I: IntoIterator, + { + Self { + exprs: exprs.into_iter().collect::>(), + } + } + + /// Creates a [`ProjectionExpr`] from a list of column indices. + /// + /// This is a convenience method for creating simple column-only projections, where each projection expression is a reference to a column + /// in the input schema. + /// + /// # Behavior + /// - Ordering: the output projection preserves the exact order of indices provided in the input slice + /// For example, `[2, 0, 1]` will produce projections for columns 2, 0, then 1 in that order + /// - Duplicates: Duplicate indices are allowed and will create multiple projection expressions referencing the same source column + /// For example, `[0, 0]` creates 2 separate projections both referencing column 0 + /// + /// # Panics + /// Panics if any index in `indices` is out of bounds for the provided schema. 
+ /// + /// # Example + /// + /// ```rust + /// use std::sync::Arc; + /// use arrow::datatypes::{Schema, Field, DataType}; + /// use datafusion_physical_expr::projection::ProjectionExprs; + /// + /// // Create a schema with three columns + /// let schema = Arc::new(Schema::new(vec![ + /// Field::new("a", DataType::Int32, false), + /// Field::new("b", DataType::Utf8, false), + /// Field::new("c", DataType::Float64, false), + /// ])); + /// + /// // Project columns at indices 2 and 0 (c and a) - ordering is preserved + /// let projection = ProjectionExprs::from_indices(&[2, 0], &schema); + /// + /// // This creates: SELECT c@2 AS c, a@0 AS a + /// assert_eq!(projection.as_ref().len(), 2); + /// assert_eq!(projection.as_ref()[0].alias, "c"); + /// assert_eq!(projection.as_ref()[1].alias, "a"); + /// + /// // Duplicate indices are allowed + /// let projection_with_dups = ProjectionExprs::from_indices(&[0, 0, 1], &schema); + /// assert_eq!(projection_with_dups.as_ref().len(), 3); + /// assert_eq!(projection_with_dups.as_ref()[0].alias, "a"); + /// assert_eq!(projection_with_dups.as_ref()[1].alias, "a"); // duplicate + /// assert_eq!(projection_with_dups.as_ref()[2].alias, "b"); + /// ``` + pub fn from_indices(indices: &[usize], schema: &SchemaRef) -> Self { + let projection_exprs = indices.iter().map(|&i| { + let field = schema.field(i); + ProjectionExpr { + expr: Arc::new(Column::new(field.name(), i)), + alias: field.name().clone(), + } + }); + + Self::from_iter(projection_exprs) } /// Returns an iterator over the projection expressions @@ -167,7 +235,7 @@ impl Projection { /// /// ```rust /// use std::sync::Arc; - /// use datafusion_physical_expr::projection::{Projection, ProjectionExpr}; + /// use datafusion_physical_expr::projection::{ProjectionExprs, ProjectionExpr}; /// use datafusion_physical_expr::expressions::{Column, BinaryExpr, Literal}; /// use datafusion_common::{Result, ScalarValue}; /// use datafusion_expr::Operator; @@ -175,7 +243,7 @@ impl Projection { /// fn main() -> Result<()> { /// // Example from the docstring: /// // Base projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z - /// let base = Projection::new(vec![ + /// let base = ProjectionExprs::new(vec![ /// ProjectionExpr { /// expr: Arc::new(Column::new("c", 2)), /// alias: "x".to_string(), @@ -191,7 +259,7 @@ impl Projection { /// ]); /// /// // Top projection: SELECT x@0 + 1 AS c1, y@1 + z@2 AS c2 - /// let top = Projection::new(vec![ + /// let top = ProjectionExprs::new(vec![ /// ProjectionExpr { /// expr: Arc::new(BinaryExpr::new( /// Arc::new(Column::new("x", 0)), @@ -224,7 +292,7 @@ impl Projection { /// # Errors /// This function returns an error if any expression in the `other` projection cannot be /// applied on top of this projection. - pub fn try_merge(&self, other: &Projection) -> Result { + pub fn try_merge(&self, other: &ProjectionExprs) -> Result { let mut new_exprs = Vec::with_capacity(other.exprs.len()); for proj_expr in &other.exprs { let new_expr = update_expr(&proj_expr.expr, &self.exprs, true)? @@ -240,7 +308,7 @@ impl Projection { alias: proj_expr.alias.clone(), }); } - Ok(Projection::new(new_exprs)) + Ok(ProjectionExprs::new(new_exprs)) } /// Extract the column indices used in this projection. @@ -256,6 +324,46 @@ impl Projection { .collect_vec() } + /// Extract the ordered column indices for a column-only projection. + /// + /// This function assumes that all expressions in the projection are simple column references. + /// It returns the column indices in the order they appear in the projection. 
+ /// + /// # Panics + /// + /// Panics if any expression in the projection is not a simple column reference. This includes: + /// - Computed expressions (e.g., `a + 1`, `CAST(a AS INT)`) + /// - Function calls (e.g., `UPPER(name)`, `SUM(amount)`) + /// - Literals (e.g., `42`, `'hello'`) + /// - Complex nested expressions (e.g., `CASE WHEN ... THEN ... END`) + /// + /// # Returns + /// + /// A vector of column indices in projection order. Unlike [`column_indices()`](Self::column_indices), + /// this function: + /// - Preserves the projection order (does not sort) + /// - Preserves duplicates (does not deduplicate) + /// + /// # Example + /// + /// For a projection `SELECT c, a, c` where `a` is at index 0 and `c` is at index 2, + /// this function would return `[2, 0, 2]`. + /// + /// Use [`column_indices()`](Self::column_indices) instead if the projection may contain + /// non-column expressions or if you need a deduplicated sorted list. + pub fn ordered_column_indices(&self) -> Vec { + self.exprs + .iter() + .map(|e| { + e.expr + .as_any() + .downcast_ref::() + .expect("Expected column reference in projection") + .index() + }) + .collect() + } + /// Project a schema according to this projection. /// For example, for a projection `SELECT a AS x, b + 1 AS y`, where `a` is at index 0 and `b` is at index 1, /// if the input schema is `[a: Int32, b: Int32, c: Int32]`, the output schema would be `[x: Int32, y: Int32]`. @@ -327,7 +435,7 @@ impl Projection { } } -impl<'a> IntoIterator for &'a Projection { +impl<'a> IntoIterator for &'a ProjectionExprs { type Item = &'a ProjectionExpr; type IntoIter = std::slice::Iter<'a, ProjectionExpr>; @@ -336,7 +444,7 @@ impl<'a> IntoIterator for &'a Projection { } } -impl IntoIterator for Projection { +impl IntoIterator for ProjectionExprs { type Item = ProjectionExpr; type IntoIter = std::vec::IntoIter; @@ -1570,7 +1678,7 @@ pub(crate) mod tests { let source = get_stats(); let schema = get_schema(); - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col1", 1)), alias: "col1".to_string(), @@ -1612,7 +1720,7 @@ pub(crate) mod tests { let source = get_stats(); let schema = get_schema(); - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col2", 2)), alias: "col2".to_string(), @@ -1663,7 +1771,7 @@ pub(crate) mod tests { alias: "b".to_string(), }, ]; - let projection = Projection::new(exprs.clone()); + let projection = ProjectionExprs::new(exprs.clone()); assert_eq!(projection.as_ref().len(), 2); Ok(()) } @@ -1674,7 +1782,7 @@ pub(crate) mod tests { expr: Arc::new(Column::new("x", 0)), alias: "x".to_string(), }]; - let projection: Projection = exprs.clone().into(); + let projection: ProjectionExprs = exprs.clone().into(); assert_eq!(projection.as_ref().len(), 1); Ok(()) } @@ -1691,7 +1799,7 @@ pub(crate) mod tests { alias: "col2".to_string(), }, ]; - let projection = Projection::new(exprs); + let projection = ProjectionExprs::new(exprs); let as_ref: &[ProjectionExpr] = projection.as_ref(); assert_eq!(as_ref.len(), 2); Ok(()) @@ -1700,7 +1808,7 @@ pub(crate) mod tests { #[test] fn test_column_indices_multiple_columns() -> Result<()> { // Test with reversed column order to ensure proper reordering - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("c", 5)), alias: "c".to_string(), @@ -1722,7 +1830,7 @@ pub(crate) mod tests { 
#[test] fn test_column_indices_duplicates() -> Result<()> { // Test that duplicate column indices appear only once - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("a", 1)), alias: "a".to_string(), @@ -1743,7 +1851,7 @@ pub(crate) mod tests { #[test] fn test_column_indices_unsorted() -> Result<()> { // Test that column indices are sorted in the output - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("c", 5)), alias: "c".to_string(), @@ -1769,7 +1877,7 @@ pub(crate) mod tests { Operator::Plus, Arc::new(Column::new("b", 4)), )); - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr, alias: "sum".to_string(), @@ -1786,7 +1894,7 @@ pub(crate) mod tests { #[test] fn test_column_indices_empty() -> Result<()> { - let projection = Projection::new(vec![]); + let projection = ProjectionExprs::new(vec![]); assert_eq!(projection.column_indices(), Vec::::new()); Ok(()) } @@ -1794,7 +1902,7 @@ pub(crate) mod tests { #[test] fn test_merge_simple_columns() -> Result<()> { // First projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z - let base_projection = Projection::new(vec![ + let base_projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("c", 2)), alias: "x".to_string(), @@ -1810,7 +1918,7 @@ pub(crate) mod tests { ]); // Second projection: SELECT y@1 AS col2, x@0 AS col1 - let top_projection = Projection::new(vec![ + let top_projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("y", 1)), alias: "col2".to_string(), @@ -1831,7 +1939,7 @@ pub(crate) mod tests { #[test] fn test_merge_with_expressions() -> Result<()> { // First projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z - let base_projection = Projection::new(vec![ + let base_projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("c", 2)), alias: "x".to_string(), @@ -1847,7 +1955,7 @@ pub(crate) mod tests { ]); // Second projection: SELECT y@1 + z@2 AS c2, x@0 + 1 AS c1 - let top_projection = Projection::new(vec![ + let top_projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(BinaryExpr::new( Arc::new(Column::new("y", 1)), @@ -1876,7 +1984,7 @@ pub(crate) mod tests { #[test] fn try_merge_error() { // Create a base projection - let base = Projection::new(vec![ + let base = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("a", 0)), alias: "x".to_string(), @@ -1888,7 +1996,7 @@ pub(crate) mod tests { ]); // Create a top projection that references a non-existent column index - let top = Projection::new(vec![ProjectionExpr { + let top = ProjectionExprs::new(vec![ProjectionExpr { expr: Arc::new(Column::new("z", 5)), // Invalid index alias: "result".to_string(), }]); @@ -1907,7 +2015,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection: SELECT col2 AS c, col0 AS a - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col2", 2)), alias: "c".to_string(), @@ -1940,7 +2048,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection: SELECT col0 + 1 AS incremented - let projection = Projection::new(vec![ProjectionExpr { + let projection = ProjectionExprs::new(vec![ProjectionExpr { expr: Arc::new(BinaryExpr::new( Arc::new(Column::new("col0", 0)), Operator::Plus, @@ -1974,7 +2082,7 @@ pub(crate) mod tests { ]); // 
Projection: SELECT col0 AS renamed - let projection = Projection::new(vec![ProjectionExpr { + let projection = ProjectionExprs::new(vec![ProjectionExpr { expr: Arc::new(Column::new("col0", 0)), alias: "renamed".to_string(), }]); @@ -1994,7 +2102,7 @@ pub(crate) mod tests { #[test] fn test_project_schema_empty() -> Result<()> { let input_schema = get_schema(); - let projection = Projection::new(vec![]); + let projection = ProjectionExprs::new(vec![]); let output_schema = projection.project_schema(&input_schema)?; @@ -2009,7 +2117,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection: SELECT col1 AS text, col0 AS num - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col1", 1)), alias: "text".to_string(), @@ -2057,7 +2165,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection with expression: SELECT col0 + 1 AS incremented, col1 AS text - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(BinaryExpr::new( Arc::new(Column::new("col0", 0)), @@ -2105,7 +2213,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection with only primitive width columns: SELECT col2 AS f, col0 AS i - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col2", 2)), alias: "f".to_string(), @@ -2136,7 +2244,7 @@ pub(crate) mod tests { let input_stats = get_stats(); let input_schema = get_schema(); - let projection = Projection::new(vec![]); + let projection = ProjectionExprs::new(vec![]); let output_stats = projection.project_statistics(input_stats, &input_schema)?; diff --git a/datafusion/physical-plan/src/aggregates/group_values/metrics.rs b/datafusion/physical-plan/src/aggregates/group_values/metrics.rs new file mode 100644 index 000000000000..c4e29ea71060 --- /dev/null +++ b/datafusion/physical-plan/src/aggregates/group_values/metrics.rs @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Metrics for the various group-by implementations. + +use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time}; + +pub(crate) struct GroupByMetrics { + /// Time spent calculating the group IDs from the evaluated grouping columns. + pub(crate) time_calculating_group_ids: Time, + /// Time spent evaluating the inputs to the aggregate functions. + pub(crate) aggregate_arguments_time: Time, + /// Time spent evaluating the aggregate expressions themselves + /// (e.g. summing all elements and counting number of elements for `avg` aggregate). 
+ pub(crate) aggregation_time: Time, + /// Time spent emitting the final results and constructing the record batch + /// which includes finalizing the grouping expressions + /// (e.g. emit from the hash table in case of hash aggregation) and the accumulators + pub(crate) emitting_time: Time, +} + +impl GroupByMetrics { + pub(crate) fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self { + Self { + time_calculating_group_ids: MetricBuilder::new(metrics) + .subset_time("time_calculating_group_ids", partition), + aggregate_arguments_time: MetricBuilder::new(metrics) + .subset_time("aggregate_arguments_time", partition), + aggregation_time: MetricBuilder::new(metrics) + .subset_time("aggregation_time", partition), + emitting_time: MetricBuilder::new(metrics) + .subset_time("emitting_time", partition), + } + } +} + +#[cfg(test)] +mod tests { + use crate::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; + use crate::metrics::MetricsSet; + use crate::test::TestMemoryExec; + use crate::{collect, ExecutionPlan}; + use arrow::array::{Float64Array, UInt32Array}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + use datafusion_common::Result; + use datafusion_execution::TaskContext; + use datafusion_functions_aggregate::count::count_udaf; + use datafusion_functions_aggregate::sum::sum_udaf; + use datafusion_physical_expr::aggregate::AggregateExprBuilder; + use datafusion_physical_expr::expressions::col; + use std::sync::Arc; + + /// Helper function to verify all three GroupBy metrics exist and have non-zero values + fn assert_groupby_metrics(metrics: &MetricsSet) { + let agg_arguments_time = metrics.sum_by_name("aggregate_arguments_time"); + assert!(agg_arguments_time.is_some()); + assert!(agg_arguments_time.unwrap().as_usize() > 0); + + let aggregation_time = metrics.sum_by_name("aggregation_time"); + assert!(aggregation_time.is_some()); + assert!(aggregation_time.unwrap().as_usize() > 0); + + let emitting_time = metrics.sum_by_name("emitting_time"); + assert!(emitting_time.is_some()); + assert!(emitting_time.unwrap().as_usize() > 0); + } + + #[tokio::test] + async fn test_groupby_metrics_partial_mode() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::Float64, false), + ])); + + // Create multiple batches to ensure metrics accumulate + let batches = (0..5) + .map(|i| { + RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(UInt32Array::from(vec![1, 2, 3, 4])), + Arc::new(Float64Array::from(vec![ + i as f64, + (i + 1) as f64, + (i + 2) as f64, + (i + 3) as f64, + ])), + ], + ) + .unwrap() + }) + .collect::>(); + + let input = TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?; + + let group_by = + PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]); + + let aggregates = vec![ + Arc::new( + AggregateExprBuilder::new(sum_udaf(), vec![col("b", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("SUM(b)") + .build()?, + ), + Arc::new( + AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("COUNT(b)") + .build()?, + ), + ]; + + let aggregate_exec = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + group_by, + aggregates, + vec![None, None], + input, + schema, + )?); + + let task_ctx = Arc::new(TaskContext::default()); + let _result = + collect(Arc::clone(&aggregate_exec) as _, Arc::clone(&task_ctx)).await?; + + let metrics = 
aggregate_exec.metrics().unwrap(); + assert_groupby_metrics(&metrics); + + Ok(()) + } + + #[tokio::test] + async fn test_groupby_metrics_final_mode() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::Float64, false), + ])); + + let batches = (0..3) + .map(|i| { + RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(UInt32Array::from(vec![1, 2, 3])), + Arc::new(Float64Array::from(vec![ + i as f64, + (i + 1) as f64, + (i + 2) as f64, + ])), + ], + ) + .unwrap() + }) + .collect::>(); + + let partial_input = + TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?; + + let group_by = + PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]); + + let aggregates = vec![Arc::new( + AggregateExprBuilder::new(sum_udaf(), vec![col("b", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("SUM(b)") + .build()?, + )]; + + // Create partial aggregate + let partial_aggregate = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + group_by.clone(), + aggregates.clone(), + vec![None], + partial_input, + Arc::clone(&schema), + )?); + + // Create final aggregate + let final_aggregate = Arc::new(AggregateExec::try_new( + AggregateMode::Final, + group_by.as_final(), + aggregates, + vec![None], + partial_aggregate, + schema, + )?); + + let task_ctx = Arc::new(TaskContext::default()); + let _result = + collect(Arc::clone(&final_aggregate) as _, Arc::clone(&task_ctx)).await?; + + let metrics = final_aggregate.metrics().unwrap(); + assert_groupby_metrics(&metrics); + + Ok(()) + } +} diff --git a/datafusion/physical-plan/src/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs index 316fbe11ae31..5f2a2faa1112 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/mod.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/mod.rs @@ -46,8 +46,11 @@ use crate::aggregates::{ order::GroupOrdering, }; +mod metrics; mod null_builder; +pub(crate) use metrics::GroupByMetrics; + /// Stores the group values during hash aggregation. 
/// /// # Background diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 6132a8b0add5..98c8cb235ca4 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -23,7 +23,7 @@ use std::vec; use super::order::GroupOrdering; use super::AggregateExec; -use crate::aggregates::group_values::{new_group_values, GroupValues}; +use crate::aggregates::group_values::{new_group_values, GroupByMetrics, GroupValues}; use crate::aggregates::order::GroupOrderingFull; use crate::aggregates::{ create_schema, evaluate_group_by, evaluate_many, evaluate_optional, AggregateMode, @@ -49,6 +49,7 @@ use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{GroupsAccumulatorAdapter, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; +use datafusion_common::instant::Instant; use futures::ready; use futures::stream::{Stream, StreamExt}; use log::debug; @@ -430,6 +431,9 @@ pub(crate) struct GroupedHashAggregateStream { /// Execution metrics baseline_metrics: BaselineMetrics, + + /// Aggregation-specific metrics + group_by_metrics: GroupByMetrics, } impl GroupedHashAggregateStream { @@ -447,6 +451,7 @@ impl GroupedHashAggregateStream { let batch_size = context.session_config().batch_size(); let input = agg.input.execute(partition, Arc::clone(&context))?; let baseline_metrics = BaselineMetrics::new(&agg.metrics, partition); + let group_by_metrics = GroupByMetrics::new(&agg.metrics, partition); let timer = baseline_metrics.elapsed_compute().timer(); @@ -609,6 +614,7 @@ impl GroupedHashAggregateStream { current_group_indices: Default::default(), exec_state, baseline_metrics, + group_by_metrics, batch_size, group_ordering, input_done: false, @@ -830,12 +836,25 @@ impl GroupedHashAggregateStream { evaluate_group_by(&self.group_by, &batch)? }; + // Only create the timer if there are actual aggregate arguments to evaluate + let timer = match ( + self.spill_state.is_stream_merging, + self.spill_state.merging_aggregate_arguments.is_empty(), + self.aggregate_arguments.is_empty(), + ) { + (true, false, _) | (false, _, false) => { + Some(self.group_by_metrics.aggregate_arguments_time.timer()) + } + _ => None, + }; + // Evaluate the aggregation expressions. let input_values = if self.spill_state.is_stream_merging { evaluate_many(&self.spill_state.merging_aggregate_arguments, &batch)? } else { evaluate_many(&self.aggregate_arguments, &batch)? 
}; + drop(timer); // Evaluate the filter expressions, if any, against the inputs let filter_values = if self.spill_state.is_stream_merging { @@ -846,6 +865,8 @@ impl GroupedHashAggregateStream { }; for group_values in &group_by_values { + let groups_start_time = Instant::now(); + // calculate the group indices for each input row let starting_num_groups = self.group_values.len(); self.group_values @@ -862,6 +883,12 @@ impl GroupedHashAggregateStream { )?; } + // Use this instant for both measurements to save a syscall + let agg_start_time = Instant::now(); + self.group_by_metrics + .time_calculating_group_ids + .add_duration(agg_start_time - groups_start_time); + // Gather the inputs to call the actual accumulator let t = self .accumulators @@ -897,6 +924,9 @@ impl GroupedHashAggregateStream { acc.merge_batch(values, group_indices, None, total_num_groups)?; } } + self.group_by_metrics + .aggregation_time + .add_elapsed(agg_start_time); } } @@ -941,6 +971,7 @@ impl GroupedHashAggregateStream { return Ok(None); } + let timer = self.group_by_metrics.emitting_time.timer(); let mut output = self.group_values.emit(emit_to)?; if let EmitTo::First(n) = emit_to { self.group_ordering.remove_groups(n); @@ -961,12 +992,14 @@ impl GroupedHashAggregateStream { | AggregateMode::SinglePartitioned => output.push(acc.evaluate(emit_to)?), } } + drop(timer); // emit reduces the memory usage. Ignore Err from update_memory_reservation. Even if it is // over the target memory size after emission, we can emit again rather than returning Err. let _ = self.update_memory_reservation(); let batch = RecordBatch::try_new(schema, output)?; debug_assert!(batch.num_rows() > 0); + Ok(Some(batch)) } diff --git a/datafusion/physical-plan/src/aggregates/topk_stream.rs b/datafusion/physical-plan/src/aggregates/topk_stream.rs index 9aaadfd52b96..eb1b7543cbfd 100644 --- a/datafusion/physical-plan/src/aggregates/topk_stream.rs +++ b/datafusion/physical-plan/src/aggregates/topk_stream.rs @@ -17,11 +17,13 @@ //! 
A memory-conscious aggregation implementation that limits group buckets to a fixed number +use crate::aggregates::group_values::GroupByMetrics; use crate::aggregates::topk::priority_map::PriorityMap; use crate::aggregates::{ aggregate_expressions, evaluate_group_by, evaluate_many, AggregateExec, PhysicalGroupBy, }; +use crate::metrics::BaselineMetrics; use crate::{RecordBatchStream, SendableRecordBatchStream}; use arrow::array::{Array, ArrayRef, RecordBatch}; use arrow::datatypes::SchemaRef; @@ -42,6 +44,8 @@ pub struct GroupedTopKAggregateStream { started: bool, schema: SchemaRef, input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + group_by_metrics: GroupByMetrics, aggregate_arguments: Vec>>, group_by: PhysicalGroupBy, priority_map: PriorityMap, @@ -57,6 +61,8 @@ impl GroupedTopKAggregateStream { let agg_schema = Arc::clone(&aggr.schema); let group_by = aggr.group_by.clone(); let input = aggr.input.execute(partition, Arc::clone(&context))?; + let baseline_metrics = BaselineMetrics::new(&aggr.metrics, partition); + let group_by_metrics = GroupByMetrics::new(&aggr.metrics, partition); let aggregate_arguments = aggregate_expressions(&aggr.aggr_expr, &aggr.mode, group_by.expr.len())?; let (val_field, desc) = aggr @@ -75,6 +81,8 @@ impl GroupedTopKAggregateStream { row_count: 0, schema: agg_schema, input, + baseline_metrics, + group_by_metrics, aggregate_arguments, group_by, priority_map, @@ -90,6 +98,8 @@ impl RecordBatchStream for GroupedTopKAggregateStream { impl GroupedTopKAggregateStream { fn intern(&mut self, ids: ArrayRef, vals: ArrayRef) -> Result<()> { + let _timer = self.group_by_metrics.time_calculating_group_ids.timer(); + let len = ids.len(); self.priority_map.set_batch(ids, Arc::clone(&vals)); @@ -111,7 +121,10 @@ impl Stream for GroupedTopKAggregateStream { mut self: Pin<&mut Self>, cx: &mut Context<'_>, ) -> Poll> { + let elapsed_compute = self.baseline_metrics.elapsed_compute().clone(); + let emitting_time = self.group_by_metrics.emitting_time.clone(); while let Poll::Ready(res) = self.input.poll_next_unpin(cx) { + let _timer = elapsed_compute.timer(); match res { // got a batch, convert to rows and append to our TreeMap Some(Ok(batch)) => { @@ -140,10 +153,15 @@ impl Stream for GroupedTopKAggregateStream { "Exactly 1 group value required" ); let group_by_values = Arc::clone(&group_by_values[0][0]); - let input_values = evaluate_many( - &self.aggregate_arguments, - batches.first().unwrap(), - )?; + let input_values = { + let _timer = (!self.aggregate_arguments.is_empty()).then(|| { + self.group_by_metrics.aggregate_arguments_time.timer() + }); + evaluate_many( + &self.aggregate_arguments, + batches.first().unwrap(), + )? + }; assert_eq!(input_values.len(), 1, "Exactly 1 input required"); assert_eq!(input_values[0].len(), 1, "Exactly 1 input required"); let input_values = Arc::clone(&input_values[0][0]); @@ -157,8 +175,11 @@ impl Stream for GroupedTopKAggregateStream { trace!("partition {} emit None", self.partition); return Poll::Ready(None); } - let cols = self.priority_map.emit()?; - let batch = RecordBatch::try_new(Arc::clone(&self.schema), cols)?; + let batch = { + let _timer = emitting_time.timer(); + let cols = self.priority_map.emit()?; + RecordBatch::try_new(Arc::clone(&self.schema), cols)? 
+ }; trace!( "partition {} emit batch with {} rows", self.partition, diff --git a/datafusion/physical-plan/src/metrics/baseline.rs b/datafusion/physical-plan/src/metrics/baseline.rs index 45cef58b5dd8..858773b94664 100644 --- a/datafusion/physical-plan/src/metrics/baseline.rs +++ b/datafusion/physical-plan/src/metrics/baseline.rs @@ -21,6 +21,8 @@ use std::task::Poll; use arrow::record_batch::RecordBatch; +use crate::spill::get_record_batch_memory_size; + use super::{Count, ExecutionPlanMetricsSet, MetricBuilder, Time, Timestamp}; use datafusion_common::Result; @@ -53,6 +55,16 @@ pub struct BaselineMetrics { /// output rows: the total output rows output_rows: Count, + + /// Memory usage of all output batches. + /// + /// Note: This value may be overestimated. If multiple output `RecordBatch` + /// instances share underlying memory buffers, their sizes will be counted + /// multiple times. + /// Issue: + output_bytes: Count, + // Remember to update `docs/source/user-guide/metrics.md` when updating comments + // or adding new metrics } impl BaselineMetrics { @@ -71,6 +83,9 @@ impl BaselineMetrics { output_rows: MetricBuilder::new(metrics) .with_type(super::MetricType::SUMMARY) .output_rows(partition), + output_bytes: MetricBuilder::new(metrics) + .with_type(super::MetricType::SUMMARY) + .output_bytes(partition), } } @@ -84,6 +99,7 @@ impl BaselineMetrics { end_time: Default::default(), elapsed_compute: self.elapsed_compute.clone(), output_rows: Default::default(), + output_bytes: Default::default(), } } @@ -211,6 +227,8 @@ impl RecordOutput for usize { impl RecordOutput for RecordBatch { fn record_output(self, bm: &BaselineMetrics) -> Self { bm.record_output(self.num_rows()); + let n_bytes = get_record_batch_memory_size(&self); + bm.output_bytes.add(n_bytes); self } } @@ -218,6 +236,8 @@ impl RecordOutput for RecordBatch { impl RecordOutput for &RecordBatch { fn record_output(self, bm: &BaselineMetrics) -> Self { bm.record_output(self.num_rows()); + let n_bytes = get_record_batch_memory_size(self); + bm.output_bytes.add(n_bytes); self } } diff --git a/datafusion/physical-plan/src/metrics/builder.rs b/datafusion/physical-plan/src/metrics/builder.rs index 74ba5a2a1834..bf59dccf6625 100644 --- a/datafusion/physical-plan/src/metrics/builder.rs +++ b/datafusion/physical-plan/src/metrics/builder.rs @@ -19,7 +19,7 @@ use std::{borrow::Cow, sync::Arc}; -use crate::metrics::MetricType; +use crate::metrics::{value::PruningMetrics, MetricType}; use super::{ Count, ExecutionPlanMetricsSet, Gauge, Label, Metric, MetricValue, Time, Timestamp, @@ -151,6 +151,14 @@ impl<'a> MetricBuilder<'a> { count } + /// Consume self and create a new counter for recording total output bytes + pub fn output_bytes(self, partition: usize) -> Count { + let count = Count::new(); + self.with_partition(partition) + .build(MetricValue::OutputBytes(count.clone())); + count + } + /// Consume self and create a new gauge for reporting current memory usage pub fn mem_used(self, partition: usize) -> Gauge { let gauge = Gauge::new(); @@ -242,4 +250,20 @@ impl<'a> MetricBuilder<'a> { .build(MetricValue::EndTimestamp(timestamp.clone())); timestamp } + + /// Consumes self and creates a new `PruningMetrics` + pub fn pruning_metrics( + self, + name: impl Into>, + partition: usize, + ) -> PruningMetrics { + let pruning_metrics = PruningMetrics::new(); + self.with_partition(partition) + .build(MetricValue::PruningMetrics { + name: name.into(), + // inner values will be `Arc::clone()` + pruning_metrics: pruning_metrics.clone(), + }); + 
pruning_metrics + } } diff --git a/datafusion/physical-plan/src/metrics/mod.rs b/datafusion/physical-plan/src/metrics/mod.rs index 0fd7bfb8c812..e66db8f0c911 100644 --- a/datafusion/physical-plan/src/metrics/mod.rs +++ b/datafusion/physical-plan/src/metrics/mod.rs @@ -35,7 +35,9 @@ use datafusion_common::HashMap; pub use baseline::{BaselineMetrics, RecordOutput, SpillMetrics, SplitMetrics}; pub use builder::MetricBuilder; pub use custom::CustomMetricValue; -pub use value::{Count, Gauge, MetricValue, ScopedTimerGuard, Time, Timestamp}; +pub use value::{ + Count, Gauge, MetricValue, PruningMetrics, ScopedTimerGuard, Time, Timestamp, +}; /// Something that tracks a value of interest (metric) of a DataFusion /// [`ExecutionPlan`] execution. @@ -296,11 +298,13 @@ impl MetricsSet { MetricValue::ElapsedCompute(_) => false, MetricValue::SpillCount(_) => false, MetricValue::SpilledBytes(_) => false, + MetricValue::OutputBytes(_) => false, MetricValue::SpilledRows(_) => false, MetricValue::CurrentMemoryUsage(_) => false, MetricValue::Gauge { name, .. } => name == metric_name, MetricValue::StartTimestamp(_) => false, MetricValue::EndTimestamp(_) => false, + MetricValue::PruningMetrics { .. } => false, MetricValue::Custom { .. } => false, }) } diff --git a/datafusion/physical-plan/src/metrics/value.rs b/datafusion/physical-plan/src/metrics/value.rs index 3149fca95ba8..3b8aa7a2bd34 100644 --- a/datafusion/physical-plan/src/metrics/value.rs +++ b/datafusion/physical-plan/src/metrics/value.rs @@ -362,6 +362,74 @@ impl Drop for ScopedTimerGuard<'_> { } } +/// Counters tracking pruning metrics +/// +/// For example, if a file scanner was initially planned to scan 10 files but skipped +/// 8 of them using statistics, the pruning metrics would look like: 10 total -> 2 matched +/// +/// Note: `clone`ing updates the same underlying metrics +#[derive(Debug, Clone)] +pub struct PruningMetrics { + pruned: Arc<AtomicUsize>, + matched: Arc<AtomicUsize>, +} + +impl Display for PruningMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let matched = self.matched.load(Ordering::Relaxed); + let total = self.pruned.load(Ordering::Relaxed) + matched; + + write!(f, "{total} total → {matched} matched") + } +} + +impl Default for PruningMetrics { + fn default() -> Self { + Self::new() + } +} + +impl PruningMetrics { + /// Create a new PruningMetrics + pub fn new() -> Self { + Self { + pruned: Arc::new(AtomicUsize::new(0)), + matched: Arc::new(AtomicUsize::new(0)), + } + } + + /// Add `n` to the metric's pruned value + pub fn add_pruned(&self, n: usize) { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + self.pruned.fetch_add(n, Ordering::Relaxed); + } + + /// Add `n` to the metric's matched value + pub fn add_matched(&self, n: usize) { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + self.matched.fetch_add(n, Ordering::Relaxed); + } + + /// Subtract `n` from the metric's matched value.
+ pub fn subtract_matched(&self, n: usize) { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + self.matched.fetch_sub(n, Ordering::Relaxed); + } + + /// Number of items pruned + pub fn pruned(&self) -> usize { + self.pruned.load(Ordering::Relaxed) + } + + /// Number of items matched (not pruned) + pub fn matched(&self) -> usize { + self.matched.load(Ordering::Relaxed) + } +} + /// Possible values for a [super::Metric]. /// /// Among other differences, the metric types have different ways to @@ -395,6 +463,8 @@ pub enum MetricValue { SpillCount(Count), /// Total size of spilled bytes produced: "spilled_bytes" metric SpilledBytes(Count), + /// Total size of output bytes produced: "output_bytes" metric + OutputBytes(Count), /// Total size of spilled rows produced: "spilled_rows" metric SpilledRows(Count), /// Current memory used @@ -424,6 +494,11 @@ pub enum MetricValue { StartTimestamp(Timestamp), /// The time at which execution ended EndTimestamp(Timestamp), + /// Metrics related to scan pruning + PruningMetrics { + name: Cow<'static, str>, + pruning_metrics: PruningMetrics, + }, Custom { /// The provided name of this metric name: Cow<'static, str>, @@ -449,6 +524,9 @@ impl PartialEq for MetricValue { (MetricValue::SpilledBytes(count), MetricValue::SpilledBytes(other)) => { count == other } + (MetricValue::OutputBytes(count), MetricValue::OutputBytes(other)) => { + count == other + } (MetricValue::SpilledRows(count), MetricValue::SpilledRows(other)) => { count == other } @@ -505,6 +583,7 @@ impl MetricValue { Self::OutputRows(_) => "output_rows", Self::SpillCount(_) => "spill_count", Self::SpilledBytes(_) => "spilled_bytes", + Self::OutputBytes(_) => "output_bytes", Self::SpilledRows(_) => "spilled_rows", Self::CurrentMemoryUsage(_) => "mem_used", Self::ElapsedCompute(_) => "elapsed_compute", @@ -513,16 +592,19 @@ impl MetricValue { Self::Time { name, .. } => name.borrow(), Self::StartTimestamp(_) => "start_timestamp", Self::EndTimestamp(_) => "end_timestamp", + Self::PruningMetrics { name, .. } => name.borrow(), Self::Custom { name, .. } => name.borrow(), } } - /// Return the value of the metric as a usize value + /// Return the value of the metric as a usize value, used to aggregate metric + /// value across partitions. pub fn as_usize(&self) -> usize { match self { Self::OutputRows(count) => count.value(), Self::SpillCount(count) => count.value(), Self::SpilledBytes(bytes) => bytes.value(), + Self::OutputBytes(bytes) => bytes.value(), Self::SpilledRows(count) => count.value(), Self::CurrentMemoryUsage(used) => used.value(), Self::ElapsedCompute(time) => time.value(), @@ -539,6 +621,10 @@ impl MetricValue { .and_then(|ts| ts.timestamp_nanos_opt()) .map(|nanos| nanos as usize) .unwrap_or(0), + // This function is a utility for aggregating metrics, for complex metric + // like `PruningMetrics`, this function is not supposed to get called. + // Metrics aggregation for them are implemented inside `MetricsSet` directly. + Self::PruningMetrics { .. } => 0, Self::Custom { value, .. 
} => value.as_usize(), } } @@ -550,6 +636,7 @@ impl MetricValue { Self::OutputRows(_) => Self::OutputRows(Count::new()), Self::SpillCount(_) => Self::SpillCount(Count::new()), Self::SpilledBytes(_) => Self::SpilledBytes(Count::new()), + Self::OutputBytes(_) => Self::OutputBytes(Count::new()), Self::SpilledRows(_) => Self::SpilledRows(Count::new()), Self::CurrentMemoryUsage(_) => Self::CurrentMemoryUsage(Gauge::new()), Self::ElapsedCompute(_) => Self::ElapsedCompute(Time::new()), @@ -567,6 +654,10 @@ impl MetricValue { }, Self::StartTimestamp(_) => Self::StartTimestamp(Timestamp::new()), Self::EndTimestamp(_) => Self::EndTimestamp(Timestamp::new()), + Self::PruningMetrics { name, .. } => Self::PruningMetrics { + name: name.clone(), + pruning_metrics: PruningMetrics::new(), + }, Self::Custom { name, value } => Self::Custom { name: name.clone(), value: value.new_empty(), @@ -588,6 +679,7 @@ impl MetricValue { (Self::OutputRows(count), Self::OutputRows(other_count)) | (Self::SpillCount(count), Self::SpillCount(other_count)) | (Self::SpilledBytes(count), Self::SpilledBytes(other_count)) + | (Self::OutputBytes(count), Self::OutputBytes(other_count)) | (Self::SpilledRows(count), Self::SpilledRows(other_count)) | ( Self::Count { count, .. }, @@ -617,6 +709,20 @@ impl MetricValue { (Self::EndTimestamp(timestamp), Self::EndTimestamp(other_timestamp)) => { timestamp.update_to_max(other_timestamp); } + ( + Self::PruningMetrics { + pruning_metrics, .. + }, + Self::PruningMetrics { + pruning_metrics: other_pruning_metrics, + .. + }, + ) => { + let pruned = other_pruning_metrics.pruned.load(Ordering::Relaxed); + let matched = other_pruning_metrics.matched.load(Ordering::Relaxed); + pruning_metrics.add_pruned(pruned); + pruning_metrics.add_matched(matched); + } ( Self::Custom { value, .. }, Self::Custom { @@ -638,18 +744,22 @@ impl MetricValue { /// numbers are "more useful" (and displayed first) pub fn display_sort_key(&self) -> u8 { match self { - Self::OutputRows(_) => 0, // show first - Self::ElapsedCompute(_) => 1, // show second - Self::SpillCount(_) => 2, - Self::SpilledBytes(_) => 3, - Self::SpilledRows(_) => 4, - Self::CurrentMemoryUsage(_) => 5, - Self::Count { .. } => 6, - Self::Gauge { .. } => 7, - Self::Time { .. } => 8, - Self::StartTimestamp(_) => 9, // show timestamps last - Self::EndTimestamp(_) => 10, - Self::Custom { .. } => 11, + // `BaselineMetrics` that is common for most operators + Self::OutputRows(_) => 0, + Self::ElapsedCompute(_) => 1, + Self::OutputBytes(_) => 2, + // Other metrics + Self::PruningMetrics { .. } => 3, + Self::SpillCount(_) => 4, + Self::SpilledBytes(_) => 5, + Self::SpilledRows(_) => 6, + Self::CurrentMemoryUsage(_) => 7, + Self::Count { .. } => 8, + Self::Gauge { .. } => 9, + Self::Time { .. } => 10, + Self::StartTimestamp(_) => 11, // show timestamps last + Self::EndTimestamp(_) => 12, + Self::Custom { .. } => 13, } } @@ -669,7 +779,7 @@ impl Display for MetricValue { | Self::Count { count, .. } => { write!(f, "{count}") } - Self::SpilledBytes(count) => { + Self::SpilledBytes(count) | Self::OutputBytes(count) => { let readable_count = human_readable_size(count.value()); write!(f, "{readable_count}") } @@ -688,6 +798,11 @@ impl Display for MetricValue { Self::StartTimestamp(timestamp) | Self::EndTimestamp(timestamp) => { write!(f, "{timestamp}") } + Self::PruningMetrics { + pruning_metrics, .. 
+ } => { + write!(f, "{pruning_metrics}") + } Self::Custom { name, value } => { write!(f, "name:{name} {value}") } diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 4dc88bc56631..2c84570b33d9 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -53,7 +53,9 @@ use datafusion_physical_expr_common::physical_expr::{fmt_sql, PhysicalExprRef}; use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; // Re-exported from datafusion-physical-expr for backwards compatibility // We recommend updating your imports to use datafusion-physical-expr directly -pub use datafusion_physical_expr::projection::{update_expr, Projection, ProjectionExpr}; +pub use datafusion_physical_expr::projection::{ + update_expr, ProjectionExpr, ProjectionExprs, +}; use futures::stream::{Stream, StreamExt}; use log::trace; @@ -65,7 +67,7 @@ use log::trace; #[derive(Debug, Clone)] pub struct ProjectionExec { /// The projection expressions stored as tuples of (expression, output column name) - projection: Projection, + projection: ProjectionExprs, /// The schema once the projection has been applied to the input schema: SchemaRef, /// The input plan @@ -130,7 +132,7 @@ impl ProjectionExec { let input_schema = input.schema(); // convert argument to Vec let expr_vec = expr.into_iter().map(Into::into).collect::>(); - let projection = Projection::new(expr_vec); + let projection = ProjectionExprs::new(expr_vec); let schema = Arc::new(projection.project_schema(&input_schema)?); diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 891fd0ae4851..a76316369ec7 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1696,7 +1696,7 @@ mod tests { // Get string representation of the plan assert_snapshot!(displayable(physical_plan.as_ref()).indent(true), @r#" - BoundedWindowAggExec: wdw=[last: Field { name: "last", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-1): Field { name: "nth_value(-1)", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-2): Field { name: "nth_value(-2)", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[last: Field { "last": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-1): Field { "nth_value(-1)": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-2): Field { "nth_value(-2)": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, partition_sizes=[3] "#); @@ -1814,7 +1814,7 @@ mod tests { // Get string representation of the plan assert_snapshot!(displayable(plan.as_ref()).indent(true), @r#" ProjectionExec: expr=[sn@0 as sn, hash@1 as hash, count([Column { name: "sn", index: 0 }]) PARTITION BY: [[Column { name: "hash", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: "sn", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]@2 as col_2] - BoundedWindowAggExec: wdw=[count([Column { 
name: "sn", index: 0 }]) PARTITION BY: [[Column { name: "hash", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: "sn", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]: Field { name: "count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Linear] + BoundedWindowAggExec: wdw=[count([Column { name: "sn", index: 0 }]) PARTITION BY: [[Column { name: "hash", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: "sn", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]: Field { "count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]": Int64 }, frame: RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Linear] StreamingTableExec: partition_sizes=1, projection=[sn, hash], infinite_source=true, output_ordering=[sn@0 ASC NULLS LAST] "#); diff --git a/datafusion/proto-common/src/lib.rs b/datafusion/proto-common/src/lib.rs index 9efb234e3994..b0061168c5ce 100644 --- a/datafusion/proto-common/src/lib.rs +++ b/datafusion/proto-common/src/lib.rs @@ -62,28 +62,33 @@ //! # use datafusion_proto_common::protobuf_common; //! # use prost::Message; //! # fn main() -> Result<()>{ -//! // Create a new ScalarValue -//! let val = ScalarValue::UInt64(Some(3)); -//! let mut buffer = BytesMut::new(); -//! let protobuf: protobuf_common::ScalarValue = match val { -//! ScalarValue::UInt64(Some(val)) => { -//! protobuf_common::ScalarValue{value: Some(protobuf_common::scalar_value::Value::Uint64Value(val))} -//! } -//! _ => unreachable!(), -//! }; +//! // Create a new ScalarValue +//! let val = ScalarValue::UInt64(Some(3)); +//! let mut buffer = BytesMut::new(); +//! let protobuf: protobuf_common::ScalarValue = match val { +//! ScalarValue::UInt64(Some(val)) => protobuf_common::ScalarValue { +//! value: Some(protobuf_common::scalar_value::Value::Uint64Value(val)), +//! }, +//! _ => unreachable!(), +//! }; //! -//! protobuf.encode(&mut buffer) +//! protobuf +//! .encode(&mut buffer) //! .map_err(|e| plan_datafusion_err!("Error encoding protobuf as bytes: {e}"))?; -//! // Convert it to bytes (for sending over the network, etc.) -//! let bytes: Bytes = buffer.into(); +//! // Convert it to bytes (for sending over the network, etc.) +//! let bytes: Bytes = buffer.into(); //! -//! let protobuf = protobuf_common::ScalarValue::decode(bytes).map_err(|e| plan_datafusion_err!("Error decoding ScalarValue as protobuf: {e}"))?; -//! // Decode bytes from somewhere (over network, etc.) back to ScalarValue -//! let decoded_val: ScalarValue = match protobuf.value { -//! Some(protobuf_common::scalar_value::Value::Uint64Value(val)) => ScalarValue::UInt64(Some(val)), -//! _ => unreachable!(), -//! }; -//! assert_eq!(val, decoded_val); +//! let protobuf = protobuf_common::ScalarValue::decode(bytes).map_err(|e| { +//! plan_datafusion_err!("Error decoding ScalarValue as protobuf: {e}") +//! })?; +//! // Decode bytes from somewhere (over network, etc.) back to ScalarValue +//! let decoded_val: ScalarValue = match protobuf.value { +//! 
Some(protobuf_common::scalar_value::Value::Uint64Value(val)) => { +//! ScalarValue::UInt64(Some(val)) +//! } +//! _ => unreachable!(), +//! }; +//! assert_eq!(val, decoded_val); //! # Ok(()) //! # } //! ``` diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 8e4131479e50..e9de1d9e9a9e 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -28,7 +28,9 @@ use arrow::datatypes::{ DataType, Field, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionMode, }; -use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator}; +use arrow::ipc::writer::{ + CompressionContext, DictionaryTracker, IpcDataGenerator, IpcWriteOptions, +}; use datafusion_common::{ config::{ CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions, @@ -1018,8 +1020,15 @@ fn encode_scalar_nested_value( let gen = IpcDataGenerator {}; let mut dict_tracker = DictionaryTracker::new(false); + let write_options = IpcWriteOptions::default(); + let mut compression_context = CompressionContext::default(); let (encoded_dictionaries, encoded_message) = gen - .encoded_batch(&batch, &mut dict_tracker, &Default::default()) + .encode( + &batch, + &mut dict_tracker, + &write_options, + &mut compression_context, + ) .map_err(|e| { Error::General(format!("Error encoding ScalarValue::List as IPC: {e}")) })?; diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 11103472ae2a..f9400d14a59c 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -179,8 +179,11 @@ message CreateExternalTableNode { message PrepareNode { string name = 1; + // We serialize both the data types and the fields for compatibility with + // older versions (newer versions populate both). 
repeated datafusion_common.ArrowType data_types = 2; LogicalPlanNode input = 3; + repeated datafusion_common.Field fields = 4; } message CreateCatalogSchemaNode { @@ -412,7 +415,11 @@ message Wildcard { message PlaceholderNode { string id = 1; + // We serialize the data type, metadata, and nullability separately to maintain + // compatibility with older versions datafusion_common.ArrowType data_type = 2; + optional bool nullable = 3; + map metadata = 4; } message LogicalExprList { diff --git a/datafusion/proto/src/bytes/mod.rs b/datafusion/proto/src/bytes/mod.rs index 12d9938373ce..6eab2239015a 100644 --- a/datafusion/proto/src/bytes/mod.rs +++ b/datafusion/proto/src/bytes/mod.rs @@ -313,7 +313,7 @@ pub fn physical_plan_from_json( let back: protobuf::PhysicalPlanNode = serde_json::from_str(json) .map_err(|e| plan_datafusion_err!("Error serializing plan: {e}"))?; let extension_codec = DefaultPhysicalExtensionCodec {}; - back.try_into_physical_plan(&ctx, &extension_codec) + back.try_into_physical_plan(ctx, &extension_codec) } /// Deserialize a PhysicalPlan from bytes diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index b34da2c312de..4cf834d0601e 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -18434,6 +18434,12 @@ impl serde::Serialize for PlaceholderNode { if self.data_type.is_some() { len += 1; } + if self.nullable.is_some() { + len += 1; + } + if !self.metadata.is_empty() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.PlaceholderNode", len)?; if !self.id.is_empty() { struct_ser.serialize_field("id", &self.id)?; @@ -18441,6 +18447,12 @@ impl serde::Serialize for PlaceholderNode { if let Some(v) = self.data_type.as_ref() { struct_ser.serialize_field("dataType", v)?; } + if let Some(v) = self.nullable.as_ref() { + struct_ser.serialize_field("nullable", v)?; + } + if !self.metadata.is_empty() { + struct_ser.serialize_field("metadata", &self.metadata)?; + } struct_ser.end() } } @@ -18454,12 +18466,16 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode { "id", "data_type", "dataType", + "nullable", + "metadata", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { Id, DataType, + Nullable, + Metadata, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -18483,6 +18499,8 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode { match value { "id" => Ok(GeneratedField::Id), "dataType" | "data_type" => Ok(GeneratedField::DataType), + "nullable" => Ok(GeneratedField::Nullable), + "metadata" => Ok(GeneratedField::Metadata), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -18504,6 +18522,8 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode { { let mut id__ = None; let mut data_type__ = None; + let mut nullable__ = None; + let mut metadata__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::Id => { @@ -18518,11 +18538,27 @@ impl<'de> serde::Deserialize<'de> for PlaceholderNode { } data_type__ = map_.next_value()?; } + GeneratedField::Nullable => { + if nullable__.is_some() { + return Err(serde::de::Error::duplicate_field("nullable")); + } + nullable__ = map_.next_value()?; + } + GeneratedField::Metadata => { + if metadata__.is_some() { + return Err(serde::de::Error::duplicate_field("metadata")); + } + metadata__ = Some( + map_.next_value::>()? 
+ ); + } } } Ok(PlaceholderNode { id: id__.unwrap_or_default(), data_type: data_type__, + nullable: nullable__, + metadata: metadata__.unwrap_or_default(), }) } } @@ -18889,6 +18925,9 @@ impl serde::Serialize for PrepareNode { if self.input.is_some() { len += 1; } + if !self.fields.is_empty() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.PrepareNode", len)?; if !self.name.is_empty() { struct_ser.serialize_field("name", &self.name)?; @@ -18899,6 +18938,9 @@ impl serde::Serialize for PrepareNode { if let Some(v) = self.input.as_ref() { struct_ser.serialize_field("input", v)?; } + if !self.fields.is_empty() { + struct_ser.serialize_field("fields", &self.fields)?; + } struct_ser.end() } } @@ -18913,6 +18955,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode { "data_types", "dataTypes", "input", + "fields", ]; #[allow(clippy::enum_variant_names)] @@ -18920,6 +18963,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode { Name, DataTypes, Input, + Fields, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -18944,6 +18988,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode { "name" => Ok(GeneratedField::Name), "dataTypes" | "data_types" => Ok(GeneratedField::DataTypes), "input" => Ok(GeneratedField::Input), + "fields" => Ok(GeneratedField::Fields), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -18966,6 +19011,7 @@ impl<'de> serde::Deserialize<'de> for PrepareNode { let mut name__ = None; let mut data_types__ = None; let mut input__ = None; + let mut fields__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::Name => { @@ -18986,12 +19032,19 @@ impl<'de> serde::Deserialize<'de> for PrepareNode { } input__ = map_.next_value()?; } + GeneratedField::Fields => { + if fields__.is_some() { + return Err(serde::de::Error::duplicate_field("fields")); + } + fields__ = Some(map_.next_value()?); + } } } Ok(PrepareNode { name: name__.unwrap_or_default(), data_types: data_types__.unwrap_or_default(), input: input__, + fields: fields__.unwrap_or_default(), }) } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 2e1c482db65c..12b417627411 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -278,10 +278,14 @@ pub struct CreateExternalTableNode { pub struct PrepareNode { #[prost(string, tag = "1")] pub name: ::prost::alloc::string::String, + /// We serialize both the data types and the fields for compatibility with + /// older versions (newer versions populate both). 
#[prost(message, repeated, tag = "2")] pub data_types: ::prost::alloc::vec::Vec, #[prost(message, optional, boxed, tag = "3")] pub input: ::core::option::Option<::prost::alloc::boxed::Box>, + #[prost(message, repeated, tag = "4")] + pub fields: ::prost::alloc::vec::Vec, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct CreateCatalogSchemaNode { @@ -651,8 +655,17 @@ pub struct Wildcard { pub struct PlaceholderNode { #[prost(string, tag = "1")] pub id: ::prost::alloc::string::String, + /// We serialize the data type, metadata, and nullability separately to maintain + /// compatibility with older versions #[prost(message, optional, tag = "2")] pub data_type: ::core::option::Option, + #[prost(bool, optional, tag = "3")] + pub nullable: ::core::option::Option, + #[prost(map = "string, string", tag = "4")] + pub metadata: ::std::collections::HashMap< + ::prost::alloc::string::String, + ::prost::alloc::string::String, + >, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct LogicalExprList { diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index b1590b9ad2aa..b16b12bc0516 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -64,15 +64,15 @@ //! # use datafusion_expr::{col, lit, Expr}; //! # use datafusion_proto::bytes::Serializeable; //! # fn main() -> Result<()>{ -//! // Create a new `Expr` a < 32 -//! let expr = col("a").lt(lit(5i32)); +//! // Create a new `Expr` a < 32 +//! let expr = col("a").lt(lit(5i32)); //! -//! // Convert it to bytes (for sending over the network, etc.) -//! let bytes = expr.to_bytes()?; +//! // Convert it to bytes (for sending over the network, etc.) +//! let bytes = expr.to_bytes()?; //! -//! // Decode bytes from somewhere (over network, etc.) back to Expr -//! let decoded_expr = Expr::from_bytes(&bytes)?; -//! assert_eq!(expr, decoded_expr); +//! // Decode bytes from somewhere (over network, etc.) back to Expr +//! let decoded_expr = Expr::from_bytes(&bytes)?; +//! assert_eq!(expr, decoded_expr); //! # Ok(()) //! # } //! 
``` diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 507a0cec9d88..598a77f5420e 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -17,6 +17,7 @@ use std::sync::Arc; +use arrow::datatypes::Field; use datafusion_common::{ exec_datafusion_err, internal_err, plan_datafusion_err, NullEquality, RecursionUnnestOption, Result, ScalarValue, TableReference, UnnestOptions, @@ -626,12 +627,25 @@ pub fn parse_expr( ExprType::Rollup(RollupNode { expr }) => Ok(Expr::GroupingSet( GroupingSet::Rollup(parse_exprs(expr, registry, codec)?), )), - ExprType::Placeholder(PlaceholderNode { id, data_type }) => match data_type { - None => Ok(Expr::Placeholder(Placeholder::new(id.clone(), None))), - Some(data_type) => Ok(Expr::Placeholder(Placeholder::new( + ExprType::Placeholder(PlaceholderNode { + id, + data_type, + nullable, + metadata, + }) => match data_type { + None => Ok(Expr::Placeholder(Placeholder::new_with_field( id.clone(), - Some(data_type.try_into()?), + None, ))), + Some(data_type) => { + let field = + Field::new("", data_type.try_into()?, nullable.unwrap_or(true)) + .with_metadata(metadata.clone()); + Ok(Expr::Placeholder(Placeholder::new_with_field( + id.clone(), + Some(field.into()), + ))) + } }, } } diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index ad5618516606..9644c9f69fea 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -33,7 +33,7 @@ use crate::{ }; use crate::protobuf::{proto_error, ToProtoError}; -use arrow::datatypes::{DataType, Schema, SchemaBuilder, SchemaRef}; +use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef}; use datafusion_catalog::cte_worktable::CteWorkTable; use datafusion_common::file_options::file_type::FileType; use datafusion_common::{ @@ -877,9 +877,33 @@ impl AsLogicalPlan for LogicalPlanNode { .iter() .map(DataType::try_from) .collect::>()?; - LogicalPlanBuilder::from(input) - .prepare(prepare.name.clone(), data_types)? - .build() + let fields: Vec = prepare + .fields + .iter() + .map(Field::try_from) + .collect::>()?; + + // If the fields are empty this may have been generated by an + // earlier version of DataFusion, in which case the DataTypes + // can be used to construct the plan. + if fields.is_empty() { + LogicalPlanBuilder::from(input) + .prepare( + prepare.name.clone(), + data_types + .into_iter() + .map(|dt| Field::new("", dt, true).into()) + .collect(), + )? + .build() + } else { + LogicalPlanBuilder::from(input) + .prepare( + prepare.name.clone(), + fields.into_iter().map(|f| f.into()).collect(), + )? 
+ .build() + } } LogicalPlanType::DropView(dropview) => { Ok(LogicalPlan::Ddl(DdlStatement::DropView(DropView { @@ -1610,7 +1634,7 @@ impl AsLogicalPlan for LogicalPlanNode { } LogicalPlan::Statement(Statement::Prepare(Prepare { name, - data_types, + fields, input, })) => { let input = @@ -1619,11 +1643,17 @@ impl AsLogicalPlan for LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::Prepare(Box::new( protobuf::PrepareNode { name: name.clone(), - data_types: data_types + input: Some(Box::new(input)), + // Store the DataTypes for reading by older DataFusion + data_types: fields .iter() - .map(|t| t.try_into()) + .map(|f| f.data_type().try_into()) + .collect::, _>>()?, + // Store the Fields for current and future DataFusion + fields: fields + .iter() + .map(|f| f.as_ref().try_into()) .collect::, _>>()?, - input: Some(Box::new(input)), }, ))), }) diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 6238c2f1cdde..2774b5b6ba7c 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -608,18 +608,20 @@ pub fn serialize_expr( })), } } - Expr::Placeholder(Placeholder { id, data_type }) => { - let data_type = match data_type { - Some(data_type) => Some(data_type.try_into()?), - None => None, - }; - protobuf::LogicalExprNode { - expr_type: Some(ExprType::Placeholder(PlaceholderNode { - id: id.clone(), - data_type, - })), - } - } + Expr::Placeholder(Placeholder { id, field }) => protobuf::LogicalExprNode { + expr_type: Some(ExprType::Placeholder(PlaceholderNode { + id: id.clone(), + data_type: match field { + Some(field) => Some(field.data_type().try_into()?), + None => None, + }, + nullable: field.as_ref().map(|f| f.is_nullable()), + metadata: field + .as_ref() + .map(|f| f.metadata().clone()) + .unwrap_or(HashMap::new()), + })), + }, }; Ok(expr_node) diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 7c4b9e55b813..2a3906d49347 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -545,7 +545,7 @@ pub fn parse_protobuf_file_scan_config( .with_file_groups(file_groups) .with_constraints(constraints) .with_statistics(statistics) - .with_projection(Some(projection)) + .with_projection_indices(Some(projection)) .with_limit(proto.limit.as_ref().map(|sl| sl.limit as usize)) .with_table_partition_cols(table_partition_cols) .with_output_ordering(output_ordering) diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index e5f4a1f7d026..0ebbb373f2d1 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -1940,7 +1940,8 @@ impl protobuf::PhysicalPlanNode { }; let table = GenerateSeriesTable::new(Arc::clone(&schema), args); - let generator = table.as_generator(generate_series.target_batch_size as usize)?; + let generator = + table.as_generator(generate_series.target_batch_size as usize, None)?; Ok(Arc::new(LazyMemoryExec::try_new(schema, vec![generator])?)) } diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 399c234191aa..dc0a78dbccf1 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -532,9 +532,10 @@ pub fn serialize_file_scan_config( statistics: Some((&conf.file_source.statistics().unwrap()).into()), limit: conf.limit.map(|l| 
protobuf::ScanLimit { limit: l as u32 }), projection: conf - .projection + .projection_exprs .as_ref() - .unwrap_or(&(0..schema.fields().len()).collect::>()) + .map(|p| p.column_indices()) + .unwrap_or((0..schema.fields().len()).collect::>()) .iter() .map(|n| *n as u32) .collect(), diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 18cd8b8e668b..bfd693e6a0f8 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -1075,6 +1075,35 @@ async fn roundtrip_logical_plan_with_view_scan() -> Result<()> { Ok(()) } +#[tokio::test] +async fn roundtrip_logical_plan_prepared_statement_with_metadata() -> Result<()> { + let ctx = SessionContext::new(); + + let plan = ctx + .sql("SELECT $1") + .await + .unwrap() + .into_optimized_plan() + .unwrap(); + let prepared = LogicalPlanBuilder::new(plan) + .prepare( + "".to_string(), + vec![Field::new("", DataType::Int32, true) + .with_metadata( + [("some_key".to_string(), "some_value".to_string())].into(), + ) + .into()], + ) + .unwrap() + .plan() + .clone(); + + let bytes = logical_plan_to_bytes(&prepared)?; + let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx.task_ctx())?; + assert_eq!(format!("{prepared}"), format!("{logical_round_trip}")); + Ok(()) +} + pub mod proto { #[derive(Clone, PartialEq, ::prost::Message)] pub struct TopKPlanProto { diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index a0456e2031be..c8b2bc02e447 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -920,7 +920,7 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { schema, file_source, ) - .with_projection(Some(vec![0, 1])) + .with_projection_indices(Some(vec![0, 1])) .with_file_group(FileGroup::new(vec![file_group])) .with_table_partition_cols(vec![Field::new( "part".to_string(), @@ -1814,7 +1814,7 @@ async fn roundtrip_projection_source() -> Result<()> { 1024, )])]) .with_statistics(statistics) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); let filter = Arc::new( diff --git a/datafusion/proto/tests/cases/serialize.rs b/datafusion/proto/tests/cases/serialize.rs index 3d6918366885..f45a62e94874 100644 --- a/datafusion/proto/tests/cases/serialize.rs +++ b/datafusion/proto/tests/cases/serialize.rs @@ -18,10 +18,11 @@ use std::sync::Arc; use arrow::array::ArrayRef; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Field}; use datafusion::execution::FunctionRegistry; use datafusion::prelude::SessionContext; +use datafusion_expr::expr::Placeholder; use datafusion_expr::{col, create_udf, lit, ColumnarValue}; use datafusion_expr::{Expr, Volatility}; use datafusion_functions::string; @@ -136,6 +137,21 @@ fn roundtrip_qualified_alias() { assert_eq!(qual_alias, roundtrip_expr(&qual_alias)); } +#[test] +fn roundtrip_placeholder_with_metadata() { + let expr = Expr::Placeholder(Placeholder::new_with_field( + "placeholder_id".to_string(), + Some( + Field::new("", DataType::Utf8, false) + .with_metadata( + [("some_key".to_string(), "some_value".to_string())].into(), + ) + .into(), + ), + )); + assert_eq!(expr, roundtrip_expr(&expr)); +} + #[test] fn roundtrip_deeply_nested_binary_expr() { // We need more stack space so this doesn't overflow in dev builds diff --git 
a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index fa3454ce5644..380ada10df6e 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -882,7 +882,7 @@ impl From> for RequiredColumns { /// ```text /// ("s1", Min, Field:s1_min) /// ("s2", Max, field:s2_max) -///``` +/// ``` /// /// And the input statistics had /// ```text @@ -5108,7 +5108,6 @@ mod tests { /// /// `expected` is a vector of bools, where true means the row group should /// be kept, and false means it should be pruned. - /// // TODO refactor other tests to use this to reduce boiler plate fn prune_with_expr( expr: Expr, diff --git a/datafusion/session/src/session.rs b/datafusion/session/src/session.rs index de23dba491fd..fd033172f224 100644 --- a/datafusion/session/src/session.rs +++ b/datafusion/session/src/session.rs @@ -57,9 +57,12 @@ use std::sync::{Arc, Weak}; /// // Given a `Session` reference, get the concrete `SessionState` reference /// // Note: this may stop working in future versions, /// fn session_state_from_session(session: &dyn Session) -> Result<&SessionState> { -/// session.as_any() -/// .downcast_ref::() -/// .ok_or_else(|| exec_datafusion_err!("Failed to downcast Session to SessionState")) +/// session +/// .as_any() +/// .downcast_ref::() +/// .ok_or_else(|| { +/// exec_datafusion_err!("Failed to downcast Session to SessionState") +/// }) /// } /// ``` /// diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs b/datafusion/spark/src/function/bitmap/bitmap_count.rs index 15bd33229a3d..56a9c5edb812 100644 --- a/datafusion/spark/src/function/bitmap/bitmap_count.rs +++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs @@ -19,13 +19,13 @@ use std::any::Any; use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryArray, BinaryViewArray, FixedSizeBinaryArray, Int64Array, - LargeBinaryArray, + as_dictionary_array, Array, ArrayRef, BinaryArray, BinaryViewArray, + FixedSizeBinaryArray, Int64Array, LargeBinaryArray, }; -use arrow::datatypes::DataType; use arrow::datatypes::DataType::{ - Binary, BinaryView, FixedSizeBinary, Int64, LargeBinary, + Binary, BinaryView, Dictionary, FixedSizeBinary, LargeBinary, }; +use arrow::datatypes::{DataType, Int16Type, Int32Type, Int64Type, Int8Type}; use datafusion_common::utils::take_function_args; use datafusion_common::{internal_err, Result}; use datafusion_expr::{ @@ -71,7 +71,7 @@ impl ScalarUDFImpl for BitmapCount { } fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(Int64) + Ok(DataType::Int64) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { @@ -90,6 +90,17 @@ macro_rules! downcast_and_count_ones { }}; } +macro_rules! 
downcast_dict_and_count_ones { + ($input_dict:expr, $key_array_type:ident) => {{ + let dict_array = as_dictionary_array::<$key_array_type>($input_dict); + let array = dict_array.downcast_dict::().unwrap(); + Ok(array + .into_iter() + .map(binary_count_ones) + .collect::()) + }}; +} + pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result { let [input_array] = take_function_args("bitmap_count", arg)?; @@ -100,6 +111,17 @@ pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result { FixedSizeBinary(_size) => { downcast_and_count_ones!(input_array, FixedSizeBinaryArray) } + Dictionary(k, v) if v.as_ref() == &Binary => match k.as_ref() { + DataType::Int8 => downcast_dict_and_count_ones!(input_array, Int8Type), + DataType::Int16 => downcast_dict_and_count_ones!(input_array, Int16Type), + DataType::Int32 => downcast_dict_and_count_ones!(input_array, Int32Type), + DataType::Int64 => downcast_dict_and_count_ones!(input_array, Int64Type), + data_type => { + internal_err!( + "bitmap_count does not support Dictionary({data_type}, Binary)" + ) + } + }, data_type => { internal_err!("bitmap_count does not support {data_type}") } @@ -114,8 +136,12 @@ mod tests { use crate::function::utils::test::test_scalar_function; use arrow::array::{Array, Int64Array}; use arrow::datatypes::DataType::Int64; + use arrow::datatypes::{DataType, Field}; + use datafusion_common::config::ConfigOptions; use datafusion_common::{Result, ScalarValue}; - use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + use datafusion_expr::ColumnarValue::Scalar; + use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; + use std::sync::Arc; macro_rules! test_bitmap_count_binary_invoke { ($INPUT:expr, $EXPECTED:expr) => { @@ -171,4 +197,31 @@ mod tests { ); Ok(()) } + + #[test] + fn test_dictionary_encoded_bitmap_count_invoke() -> Result<()> { + let dict = Scalar(ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Binary(Some(vec![0xFFu8, 0xFFu8]))), + )); + + let arg_fields = vec![Field::new( + "a", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Binary)), + true, + ) + .into()]; + let args = ScalarFunctionArgs { + args: vec![dict.clone()], + arg_fields, + number_rows: 1, + return_field: Field::new("f", Int64, true).into(), + config_options: Arc::new(ConfigOptions::default()), + }; + let udf = BitmapCount::new(); + let actual = udf.invoke_with_args(args)?; + let expect = Scalar(ScalarValue::Int64(Some(16))); + assert_eq!(*actual.into_array(1)?, *expect.into_array(1)?); + Ok(()) + } } diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index eabf645a5eaf..cb34bb0f7eb7 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -274,8 +274,28 @@ impl SqlToRel<'_, S> { } // User-defined function (UDF) should have precedence if let Some(fm) = self.context_provider.get_function_meta(&name) { - let args = self.function_args_to_expr(args, schema, planner_context)?; - let inner = ScalarFunction::new_udf(fm, args); + let (args, arg_names) = + self.function_args_to_expr_with_names(args, schema, planner_context)?; + + let resolved_args = if arg_names.iter().any(|name| name.is_some()) { + if let Some(param_names) = &fm.signature().parameter_names { + datafusion_expr::arguments::resolve_function_arguments( + param_names, + args, + arg_names, + )? 
+ } else { + return plan_err!( + "Function '{}' does not support named arguments", + fm.name() + ); + } + } else { + args + }; + + // After resolution, all arguments are positional + let inner = ScalarFunction::new_udf(fm, resolved_args); if name.eq_ignore_ascii_case(inner.name()) { return Ok(Expr::ScalarFunction(inner)); @@ -624,14 +644,29 @@ impl SqlToRel<'_, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { + let (expr, _) = + self.sql_fn_arg_to_logical_expr_with_name(sql, schema, planner_context)?; + Ok(expr) + } + + fn sql_fn_arg_to_logical_expr_with_name( + &self, + sql: FunctionArg, + schema: &DFSchema, + planner_context: &mut PlannerContext, + ) -> Result<(Expr, Option)> { match sql { FunctionArg::Named { - name: _, + name, arg: FunctionArgExpr::Expr(arg), operator: _, - } => self.sql_expr_to_logical_expr(arg, schema, planner_context), + } => { + let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?; + let arg_name = crate::utils::normalize_ident(name); + Ok((expr, Some(arg_name))) + } FunctionArg::Named { - name: _, + name, arg: FunctionArgExpr::Wildcard, operator: _, } => { @@ -640,11 +675,12 @@ impl SqlToRel<'_, S> { qualifier: None, options: Box::new(WildcardOptions::default()), }; - - Ok(expr) + let arg_name = crate::utils::normalize_ident(name); + Ok((expr, Some(arg_name))) } FunctionArg::Unnamed(FunctionArgExpr::Expr(arg)) => { - self.sql_expr_to_logical_expr(arg, schema, planner_context) + let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?; + Ok((expr, None)) } FunctionArg::Unnamed(FunctionArgExpr::Wildcard) => { #[expect(deprecated)] @@ -652,8 +688,7 @@ impl SqlToRel<'_, S> { qualifier: None, options: Box::new(WildcardOptions::default()), }; - - Ok(expr) + Ok((expr, None)) } FunctionArg::Unnamed(FunctionArgExpr::QualifiedWildcard(object_name)) => { let qualifier = self.object_name_to_table_reference(object_name)?; @@ -668,8 +703,30 @@ impl SqlToRel<'_, S> { qualifier: qualifier.into(), options: Box::new(WildcardOptions::default()), }; - - Ok(expr) + Ok((expr, None)) + } + // PostgreSQL dialect uses ExprNamed variant with expression for name + FunctionArg::ExprNamed { + name: SQLExpr::Identifier(name), + arg: FunctionArgExpr::Expr(arg), + operator: _, + } => { + let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?; + let arg_name = crate::utils::normalize_ident(name); + Ok((expr, Some(arg_name))) + } + FunctionArg::ExprNamed { + name: SQLExpr::Identifier(name), + arg: FunctionArgExpr::Wildcard, + operator: _, + } => { + #[expect(deprecated)] + let expr = Expr::Wildcard { + qualifier: None, + options: Box::new(WildcardOptions::default()), + }; + let arg_name = crate::utils::normalize_ident(name); + Ok((expr, Some(arg_name))) } _ => not_impl_err!("Unsupported qualified wildcard argument: {sql:?}"), } @@ -686,6 +743,24 @@ impl SqlToRel<'_, S> { .collect::>>() } + pub(super) fn function_args_to_expr_with_names( + &self, + args: Vec, + schema: &DFSchema, + planner_context: &mut PlannerContext, + ) -> Result<(Vec, Vec>)> { + let results: Result)>> = args + .into_iter() + .map(|a| { + self.sql_fn_arg_to_logical_expr_with_name(a, schema, planner_context) + }) + .collect(); + + let pairs = results?; + let (exprs, names): (Vec, Vec>) = pairs.into_iter().unzip(); + Ok((exprs, names)) + } + pub(crate) fn check_unnest_arg(arg: &Expr, schema: &DFSchema) -> Result<()> { // Check argument type, array types are supported match arg.get_type(schema)? 
{ diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 23426701409e..715a02db8b02 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -20,9 +20,10 @@ use datafusion_expr::planner::{ PlannerResult, RawBinaryExpr, RawDictionaryExpr, RawFieldAccessExpr, }; use sqlparser::ast::{ - AccessExpr, BinaryOperator, CastFormat, CastKind, DataType as SQLDataType, - DictionaryField, Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias, MapEntry, - StructField, Subscript, TrimWhereField, TypedString, Value, ValueWithSpan, + AccessExpr, BinaryOperator, CastFormat, CastKind, CeilFloorKind, + DataType as SQLDataType, DateTimeField, DictionaryField, Expr as SQLExpr, + ExprWithAlias as SQLExprWithAlias, MapEntry, StructField, Subscript, TrimWhereField, + TypedString, Value, ValueWithSpan, }; use datafusion_common::{ @@ -287,7 +288,9 @@ impl SqlToRel<'_, S> { schema, planner_context, )?), - self.convert_data_type(&data_type)?, + self.convert_data_type_to_field(&data_type)? + .data_type() + .clone(), ))) } @@ -297,7 +300,9 @@ impl SqlToRel<'_, S> { uses_odbc_syntax: _, }) => Ok(Expr::Cast(Cast::new( Box::new(lit(value.into_string().unwrap())), - self.convert_data_type(&data_type)?, + self.convert_data_type_to_field(&data_type)? + .data_type() + .clone(), ))), SQLExpr::IsNull(expr) => Ok(Expr::IsNull(Box::new( @@ -494,14 +499,28 @@ impl SqlToRel<'_, S> { self.sql_grouping_sets_to_expr(exprs, schema, planner_context) } - SQLExpr::Floor { - expr, - field: _field, - } => self.sql_fn_name_to_expr(*expr, "floor", schema, planner_context), - SQLExpr::Ceil { - expr, - field: _field, - } => self.sql_fn_name_to_expr(*expr, "ceil", schema, planner_context), + SQLExpr::Floor { expr, field } => match field { + CeilFloorKind::DateTimeField(DateTimeField::NoDateTime) => { + self.sql_fn_name_to_expr(*expr, "floor", schema, planner_context) + } + CeilFloorKind::DateTimeField(_) => { + not_impl_err!("FLOOR with datetime is not supported") + } + CeilFloorKind::Scale(_) => { + not_impl_err!("FLOOR with scale is not supported") + } + }, + SQLExpr::Ceil { expr, field } => match field { + CeilFloorKind::DateTimeField(DateTimeField::NoDateTime) => { + self.sql_fn_name_to_expr(*expr, "ceil", schema, planner_context) + } + CeilFloorKind::DateTimeField(_) => { + not_impl_err!("CEIL with datetime is not supported") + } + CeilFloorKind::Scale(_) => { + not_impl_err!("CEIL with scale is not supported") + } + }, SQLExpr::Overlay { expr, overlay_what, @@ -969,12 +988,12 @@ impl SqlToRel<'_, S> { return not_impl_err!("CAST with format is not supported: {format}"); } - let dt = self.convert_data_type(&data_type)?; + let dt = self.convert_data_type_to_field(&data_type)?; let expr = self.sql_expr_to_logical_expr(expr, schema, planner_context)?; // numeric constants are treated as seconds (rather as nanoseconds) // to align with postgres / duckdb semantics - let expr = match &dt { + let expr = match dt.data_type() { DataType::Timestamp(TimeUnit::Nanosecond, tz) if expr.get_type(schema)? == DataType::Int64 => { @@ -986,7 +1005,12 @@ impl SqlToRel<'_, S> { _ => expr, }; - Ok(Expr::Cast(Cast::new(Box::new(expr), dt))) + // Currently drops metadata attached to the type + // https://github.com/apache/datafusion/issues/18060 + Ok(Expr::Cast(Cast::new( + Box::new(expr), + dt.data_type().clone(), + ))) } /// Extracts the root expression and access chain from a compound expression. 
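For context on the named-argument resolution introduced in `function.rs` above: the planner now collects an optional name per SQL argument and, when any name is present, reorders the arguments into the positional order declared by the UDF's `parameter_names` before building `ScalarFunction::new_udf`. The exact behaviour of `datafusion_expr::arguments::resolve_function_arguments` (defaults, duplicate handling, ordering rules) is not visible in this diff; the standalone Rust sketch below shows one plausible resolution scheme. `resolve_named_args` and the `String` stand-ins for `Expr` are illustrative assumptions, not the library implementation.

```rust
// Minimal, hypothetical sketch of reordering named arguments into positional
// form. `String` stands in for `Expr`; the real logic lives in
// `datafusion_expr::arguments::resolve_function_arguments` and may differ.
fn resolve_named_args(
    param_names: &[&str],
    args: Vec<String>,
    arg_names: Vec<Option<String>>,
) -> Result<Vec<String>, String> {
    let mut slots: Vec<Option<String>> = vec![None; param_names.len()];
    let mut next_positional = 0;
    let mut seen_named = false;

    for (arg, name) in args.into_iter().zip(arg_names.into_iter()) {
        match name {
            // Positional arguments fill parameter slots left to right and, in
            // this sketch, must come before any named argument.
            None => {
                if seen_named {
                    return Err("positional argument after named argument".into());
                }
                if next_positional >= slots.len() {
                    return Err("too many positional arguments".into());
                }
                slots[next_positional] = Some(arg);
                next_positional += 1;
            }
            // Named arguments bind to the slot of the matching parameter name.
            Some(name) => {
                seen_named = true;
                let idx = param_names
                    .iter()
                    .position(|p| p.eq_ignore_ascii_case(&name))
                    .ok_or_else(|| format!("unknown parameter name: {name}"))?;
                if slots[idx].is_some() {
                    return Err(format!("parameter '{name}' bound twice"));
                }
                slots[idx] = Some(arg);
            }
        }
    }

    // This sketch requires every parameter to be bound; a real resolver would
    // fall back to declared defaults here instead of erroring.
    slots
        .into_iter()
        .enumerate()
        .map(|(i, slot)| {
            slot.ok_or_else(|| format!("missing argument for '{}'", param_names[i]))
        })
        .collect()
}

fn main() {
    // e.g. SELECT my_udf('one', b => 'two') for a UDF declaring parameters ["a", "b"]
    let resolved = resolve_named_args(
        &["a", "b"],
        vec!["'one'".to_string(), "'two'".to_string()],
        vec![None, Some("b".to_string())],
    );
    assert_eq!(resolved, Ok(vec!["'one'".to_string(), "'two'".to_string()]));
    println!("{resolved:?}");
}
```

After this resolution step all arguments are purely positional, which is why the planner code above can hand `resolved_args` straight to `ScalarFunction::new_udf`.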
diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs index 7075a1afd9dd..3abb2752988f 100644 --- a/datafusion/sql/src/expr/value.rs +++ b/datafusion/sql/src/expr/value.rs @@ -20,7 +20,7 @@ use arrow::compute::kernels::cast_utils::{ parse_interval_month_day_nano_config, IntervalParseConfig, IntervalUnit, }; use arrow::datatypes::{ - i256, DataType, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, + i256, FieldRef, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, }; use bigdecimal::num_bigint::BigInt; use bigdecimal::{BigDecimal, Signed, ToPrimitive}; @@ -45,7 +45,7 @@ impl SqlToRel<'_, S> { pub(crate) fn parse_value( &self, value: Value, - param_data_types: &[DataType], + param_data_types: &[FieldRef], ) -> Result { match value { Value::Number(n, _) => self.parse_sql_number(&n, false), @@ -108,7 +108,7 @@ impl SqlToRel<'_, S> { /// number 1, 2, ... etc. For example, `$1` is the first placeholder; $2 is the second one and so on. fn create_placeholder_expr( param: String, - param_data_types: &[DataType], + param_data_types: &[FieldRef], ) -> Result { // Parse the placeholder as a number because it is the only support from sqlparser and postgres let index = param[1..].parse::(); @@ -121,7 +121,7 @@ impl SqlToRel<'_, S> { Ok(index) => index - 1, Err(_) => { return if param_data_types.is_empty() { - Ok(Expr::Placeholder(Placeholder::new(param, None))) + Ok(Expr::Placeholder(Placeholder::new_with_field(param, None))) } else { // when PREPARE Statement, param_data_types length is always 0 plan_err!("Invalid placeholder, not a number: {param}") @@ -133,7 +133,7 @@ impl SqlToRel<'_, S> { // Data type of the parameter debug!("type of param {param} param_data_types[idx]: {param_type:?}"); - Ok(Expr::Placeholder(Placeholder::new( + Ok(Expr::Placeholder(Placeholder::new_with_field( param, param_type.cloned(), ))) diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 271ad8a856b4..1f1ef2a672ab 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -243,7 +243,19 @@ impl fmt::Display for CreateExternalTable { } write!(f, "{} ", self.name)?; write!(f, "STORED AS {} ", self.file_type)?; - write!(f, "LOCATION {} ", self.location) + if !self.order_exprs.is_empty() { + write!(f, "WITH ORDER (")?; + let mut first = true; + for expr in self.order_exprs.iter().flatten() { + if !first { + write!(f, ", ")?; + } + write!(f, "{expr}")?; + first = false; + } + write!(f, ") ")?; + } + write!(f, "LOCATION {}", self.location) } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index e93c5e066b66..99138e1b0016 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -21,8 +21,10 @@ use std::str::FromStr; use std::sync::Arc; use std::vec; +use crate::utils::make_decimal_type; use arrow::datatypes::*; use datafusion_common::config::SqlParserOptions; +use datafusion_common::datatype::{DataTypeExt, FieldExt}; use datafusion_common::error::add_possible_columns_to_diag; use datafusion_common::TableReference; use datafusion_common::{ @@ -31,15 +33,13 @@ use datafusion_common::{ }; use datafusion_common::{not_impl_err, plan_err, DFSchema, DataFusionError, Result}; use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder}; +pub use datafusion_expr::planner::ContextProvider; use datafusion_expr::utils::find_column_exprs; use datafusion_expr::{col, Expr}; use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo, TimezoneInfo}; use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; 
use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; -use crate::utils::make_decimal_type; -pub use datafusion_expr::planner::ContextProvider; - /// SQL parser options #[derive(Debug, Clone, Copy)] pub struct ParserOptions { @@ -256,7 +256,7 @@ impl IdentNormalizer { pub struct PlannerContext { /// Data types for numbered parameters ($1, $2, etc), if supplied /// in `PREPARE` statement - prepare_param_data_types: Arc>, + prepare_param_data_types: Arc>, /// Map of CTE name to logical plan of the WITH clause. /// Use `Arc` to allow cheap cloning ctes: HashMap>, @@ -290,7 +290,7 @@ impl PlannerContext { /// Update the PlannerContext with provided prepare_param_data_types pub fn with_prepare_param_data_types( mut self, - prepare_param_data_types: Vec, + prepare_param_data_types: Vec, ) -> Self { self.prepare_param_data_types = prepare_param_data_types.into(); self @@ -347,7 +347,7 @@ impl PlannerContext { } /// Return the types of parameters (`$1`, `$2`, etc) if known - pub fn prepare_param_data_types(&self) -> &[DataType] { + pub fn prepare_param_data_types(&self) -> &[FieldRef] { &self.prepare_param_data_types } @@ -428,16 +428,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut fields = Vec::with_capacity(columns.len()); for column in columns { - let data_type = self.convert_data_type(&column.data_type)?; + let data_type = self.convert_data_type_to_field(&column.data_type)?; let not_nullable = column .options .iter() .any(|x| x.option == ColumnOption::NotNull); - fields.push(Field::new( - self.ident_normalizer.normalize(column.name), - data_type, - !not_nullable, - )); + fields.push( + data_type + .as_ref() + .clone() + .with_name(self.ident_normalizer.normalize(column.name)) + .with_nullable(!not_nullable), + ); } Ok(Schema::new(fields)) @@ -587,11 +589,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }) } - pub(crate) fn convert_data_type(&self, sql_type: &SQLDataType) -> Result { + pub(crate) fn convert_data_type_to_field( + &self, + sql_type: &SQLDataType, + ) -> Result { // First check if any of the registered type_planner can handle this type if let Some(type_planner) = self.context_provider.get_type_planner() { if let Some(data_type) = type_planner.plan_type(sql_type)? { - return Ok(data_type); + return Ok(data_type.into_nullable_field_ref()); } } @@ -599,28 +604,30 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { match sql_type { SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) => { // Arrays may be multi-dimensional. 
- let inner_data_type = self.convert_data_type(inner_sql_type)?; - Ok(DataType::new_list(inner_data_type, true)) + Ok(self.convert_data_type_to_field(inner_sql_type)?.into_list()) } SQLDataType::Array(ArrayElemTypeDef::SquareBracket( inner_sql_type, maybe_array_size, )) => { - let inner_data_type = self.convert_data_type(inner_sql_type)?; + let inner_field = self.convert_data_type_to_field(inner_sql_type)?; if let Some(array_size) = maybe_array_size { - Ok(DataType::new_fixed_size_list( - inner_data_type, - *array_size as i32, - true, - )) + let array_size: i32 = (*array_size).try_into().map_err(|_| { + plan_datafusion_err!( + "Array size must be a positive 32 bit integer, got {array_size}" + ) + })?; + Ok(inner_field.into_fixed_size_list(array_size)) } else { - Ok(DataType::new_list(inner_data_type, true)) + Ok(inner_field.into_list()) } } SQLDataType::Array(ArrayElemTypeDef::None) => { not_impl_err!("Arrays with unspecified type is not supported") } - other => self.convert_simple_data_type(other), + other => Ok(self + .convert_simple_data_type(other)? + .into_nullable_field_ref()), } } @@ -733,17 +740,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let fields = fields .iter() .enumerate() - .map(|(idx, field)| { - let data_type = self.convert_data_type(&field.field_type)?; - let field_name = match &field.field_name { + .map(|(idx, sql_struct_field)| { + let field = self.convert_data_type_to_field(&sql_struct_field.field_type)?; + let field_name = match &sql_struct_field.field_name { Some(ident) => ident.clone(), None => Ident::new(format!("c{idx}")), }; - Ok(Arc::new(Field::new( - self.ident_normalizer.normalize(field_name), - data_type, - true, - ))) + Ok(field.as_ref().clone().with_name(self.ident_normalizer.normalize(field_name))) }) .collect::>>()?; Ok(DataType::Struct(Fields::from(fields))) diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 0e868e8c2689..81381bf49fc5 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -29,7 +29,7 @@ use crate::planner::{ }; use crate::utils::normalize_ident; -use arrow::datatypes::{DataType, Fields}; +use arrow::datatypes::{Field, FieldRef, Fields}; use datafusion_common::error::_plan_err; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ @@ -242,6 +242,16 @@ impl SqlToRel<'_, S> { table_name, .. } => self.describe_table_to_plan(table_name), + Statement::Explain { + describe_alias: DescribeAlias::Describe | DescribeAlias::Desc, // only parse 'DESCRIBE statement' or 'DESC statement' and not 'EXPLAIN statement' + statement, + .. 
+ } => match *statement { + Statement::Query(query) => self.describe_query_to_plan(*query), + _ => { + not_impl_err!("Describing statements other than SELECT not supported") + } + }, Statement::Explain { verbose, statement, @@ -730,14 +740,14 @@ impl SqlToRel<'_, S> { statement, } => { // Convert parser data types to DataFusion data types - let mut data_types: Vec = data_types + let mut fields: Vec = data_types .into_iter() - .map(|t| self.convert_data_type(&t)) + .map(|t| self.convert_data_type_to_field(&t)) .collect::>()?; // Create planner context with parameters - let mut planner_context = PlannerContext::new() - .with_prepare_param_data_types(data_types.clone()); + let mut planner_context = + PlannerContext::new().with_prepare_param_data_types(fields.clone()); // Build logical plan for inner statement of the prepare statement let plan = self.sql_statement_to_plan_with_context_impl( @@ -745,21 +755,21 @@ impl SqlToRel<'_, S> { &mut planner_context, )?; - if data_types.is_empty() { - let map_types = plan.get_parameter_types()?; + if fields.is_empty() { + let map_types = plan.get_parameter_fields()?; let param_types: Vec<_> = (1..=map_types.len()) .filter_map(|i| { let key = format!("${i}"); map_types.get(&key).and_then(|opt| opt.clone()) }) .collect(); - data_types.extend(param_types.iter().cloned()); + fields.extend(param_types.iter().cloned()); planner_context.with_prepare_param_data_types(param_types); } Ok(LogicalPlan::Statement(PlanStatement::Prepare(Prepare { name: ident_to_string(&name), - data_types, + fields, input: Arc::new(plan), }))) } @@ -1179,7 +1189,7 @@ impl SqlToRel<'_, S> { .. }) => { let return_type = match return_type { - Some(t) => Some(self.convert_data_type(&t)?), + Some(t) => Some(self.convert_data_type_to_field(&t)?), None => None, }; let mut planner_context = PlannerContext::new(); @@ -1190,7 +1200,8 @@ impl SqlToRel<'_, S> { let function_args = function_args .into_iter() .map(|arg| { - let data_type = self.convert_data_type(&arg.data_type)?; + let data_type = + self.convert_data_type_to_field(&arg.data_type)?; let default_expr = match arg.default_expr { Some(expr) => Some(self.sql_to_expr( @@ -1203,7 +1214,7 @@ impl SqlToRel<'_, S> { Ok(OperateFunctionArg { name: arg.name, default_expr, - data_type, + data_type: data_type.data_type().clone(), }) }) .collect::>>(); @@ -1221,7 +1232,9 @@ impl SqlToRel<'_, S> { // Convert resulting expression to data fusion expression // let arg_types = args.as_ref().map(|arg| { - arg.iter().map(|t| t.data_type.clone()).collect::>() + arg.iter() + .map(|t| Arc::new(Field::new("", t.data_type.clone(), true))) + .collect::>() }); let mut planner_context = PlannerContext::new() .with_prepare_param_data_types(arg_types.unwrap_or_default()); @@ -1264,7 +1277,7 @@ impl SqlToRel<'_, S> { or_replace, temporary, name, - return_type, + return_type: return_type.map(|f| f.data_type().clone()), args, params, schema: DFSchemaRef::new(DFSchema::empty()), @@ -1396,6 +1409,19 @@ impl SqlToRel<'_, S> { })) } + fn describe_query_to_plan(&self, query: Query) -> Result { + let plan = self.query_to_plan(query, &mut PlannerContext::new())?; + + let schema = Arc::new(plan.schema().as_arrow().clone()); + + let output_schema = DFSchema::try_from(LogicalPlan::describe_schema()).unwrap(); + + Ok(LogicalPlan::DescribeTable(DescribeTable { + schema, + output_schema: Arc::new(output_schema), + })) + } + fn copy_to_plan(&self, statement: CopyToStatement) -> Result { // Determine if source is table or query and handle accordingly let copy_source = 
statement.source; @@ -1998,10 +2024,10 @@ impl SqlToRel<'_, S> { )?; // Update placeholder's datatype to the type of the target column if let Expr::Placeholder(placeholder) = &mut expr { - placeholder.data_type = placeholder - .data_type + placeholder.field = placeholder + .field .take() - .or_else(|| Some(field.data_type().clone())); + .or_else(|| Some(Arc::clone(field))); } // Cast to target column type, if necessary expr.cast_to(field.data_type(), source.schema())? @@ -2105,8 +2131,7 @@ impl SqlToRel<'_, S> { idx + 1 ) })?; - let dt = field.data_type().clone(); - let _ = prepare_param_data_types.insert(name, dt); + let _ = prepare_param_data_types.insert(name, Arc::clone(field)); } } } diff --git a/datafusion/sql/tests/cases/params.rs b/datafusion/sql/tests/cases/params.rs index 343a90af3efb..147628656d8f 100644 --- a/datafusion/sql/tests/cases/params.rs +++ b/datafusion/sql/tests/cases/params.rs @@ -16,8 +16,12 @@ // under the License. use crate::logical_plan; -use arrow::datatypes::DataType; -use datafusion_common::{assert_contains, ParamValues, ScalarValue}; +use arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion_common::{ + assert_contains, + metadata::{format_type_and_metadata, ScalarAndMetadata}, + ParamValues, ScalarValue, +}; use datafusion_expr::{LogicalPlan, Prepare, Statement}; use insta::assert_snapshot; use itertools::Itertools as _; @@ -51,12 +55,42 @@ impl ParameterTest<'_> { } } +pub struct ParameterTestWithMetadata<'a> { + pub sql: &'a str, + pub expected_types: Vec<(&'a str, Option)>, + pub param_values: Vec, +} + +impl ParameterTestWithMetadata<'_> { + pub fn run(&self) -> String { + let plan = logical_plan(self.sql).unwrap(); + + let actual_types = plan.get_parameter_fields().unwrap(); + let expected_types: HashMap> = self + .expected_types + .iter() + .map(|(k, v)| ((*k).to_string(), v.clone())) + .collect(); + + assert_eq!(actual_types, expected_types); + + let plan_with_params = plan + .clone() + .with_param_values(ParamValues::List(self.param_values.clone())) + .unwrap(); + + format!("** Initial Plan:\n{plan}\n** Final Plan:\n{plan_with_params}") + } +} + fn generate_prepare_stmt_and_data_types(sql: &str) -> (LogicalPlan, String) { let plan = logical_plan(sql).unwrap(); let data_types = match &plan { - LogicalPlan::Statement(Statement::Prepare(Prepare { data_types, .. })) => { - data_types.iter().join(", ").to_string() - } + LogicalPlan::Statement(Statement::Prepare(Prepare { fields, .. 
})) => fields + .iter() + .map(|f| format_type_and_metadata(f.data_type(), Some(f.metadata()))) + .join(", ") + .to_string(), _ => panic!("Expected a Prepare statement"), }; (plan, data_types) @@ -633,11 +667,11 @@ fn test_insert_infer() { @r#" ** Initial Plan: Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 Values: ($1, $2, $3) ** Final Plan: Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 Values: (UInt32(1) AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3) "# ); @@ -664,11 +698,11 @@ fn test_prepare_statement_insert_infer() { ** Initial Plan: Prepare: "my_plan" [UInt32, Utf8, Utf8] Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 Values: ($1, $2, $3) ** Final Plan: Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 Values: (UInt32(1) AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3) "# ); @@ -704,6 +738,147 @@ fn test_prepare_statement_to_plan_one_param() { ); } +#[test] +fn test_update_infer_with_metadata() { + // Here the uuid field is inferred as nullable because it appears in the filter + // (and not in the update values, where its nullability would be inferred) + let uuid_field = Field::new("", DataType::FixedSizeBinary(16), true).with_metadata( + [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())].into(), + ); + let uuid_bytes = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let expected_types = vec![ + ( + "$1", + Some(Field::new("last_name", DataType::Utf8, false).into()), + ), + ("$2", Some(uuid_field.clone().with_name("id").into())), + ]; + let param_values = vec![ + ScalarAndMetadata::from(ScalarValue::from("Turing")), + ScalarAndMetadata::new( + ScalarValue::FixedSizeBinary(16, Some(uuid_bytes)), + 
Some(uuid_field.metadata().into()), + ), + ]; + + // Check a normal update + let test = ParameterTestWithMetadata { + sql: "update person_with_uuid_extension set last_name=$1 where id=$2", + expected_types: expected_types.clone(), + param_values: param_values.clone(), + }; + + assert_snapshot!( + test.run(), + @r#" + ** Initial Plan: + Dml: op=[Update] table=[person_with_uuid_extension] + Projection: person_with_uuid_extension.id AS id, person_with_uuid_extension.first_name AS first_name, $1 AS last_name + Filter: person_with_uuid_extension.id = $2 + TableScan: person_with_uuid_extension + ** Final Plan: + Dml: op=[Update] table=[person_with_uuid_extension] + Projection: person_with_uuid_extension.id AS id, person_with_uuid_extension.first_name AS first_name, Utf8("Turing") AS last_name + Filter: person_with_uuid_extension.id = FixedSizeBinary(16, "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16") FieldMetadata { inner: {"ARROW:extension:name": "arrow.uuid"} } + TableScan: person_with_uuid_extension + "# + ); + + // Check a prepared update + let test = ParameterTestWithMetadata { + sql: "PREPARE my_plan AS update person_with_uuid_extension set last_name=$1 where id=$2", + expected_types, + param_values + }; + + assert_snapshot!( + test.run(), + @r#" + ** Initial Plan: + Prepare: "my_plan" [Utf8, FixedSizeBinary(16)<{"ARROW:extension:name": "arrow.uuid"}>] + Dml: op=[Update] table=[person_with_uuid_extension] + Projection: person_with_uuid_extension.id AS id, person_with_uuid_extension.first_name AS first_name, $1 AS last_name + Filter: person_with_uuid_extension.id = $2 + TableScan: person_with_uuid_extension + ** Final Plan: + Dml: op=[Update] table=[person_with_uuid_extension] + Projection: person_with_uuid_extension.id AS id, person_with_uuid_extension.first_name AS first_name, Utf8("Turing") AS last_name + Filter: person_with_uuid_extension.id = FixedSizeBinary(16, "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16") FieldMetadata { inner: {"ARROW:extension:name": "arrow.uuid"} } + TableScan: person_with_uuid_extension + "# + ); +} + +#[test] +fn test_insert_infer_with_metadata() { + let uuid_field = Field::new("", DataType::FixedSizeBinary(16), false).with_metadata( + [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())].into(), + ); + let uuid_bytes = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let expected_types = vec![ + ("$1", Some(uuid_field.clone().with_name("id").into())), + ( + "$2", + Some(Field::new("first_name", DataType::Utf8, false).into()), + ), + ( + "$3", + Some(Field::new("last_name", DataType::Utf8, false).into()), + ), + ]; + let param_values = vec![ + ScalarAndMetadata::new( + ScalarValue::FixedSizeBinary(16, Some(uuid_bytes)), + Some(uuid_field.metadata().into()), + ), + ScalarAndMetadata::from(ScalarValue::from("Alan")), + ScalarAndMetadata::from(ScalarValue::from("Turing")), + ]; + + // Check a normal insert + let test = ParameterTestWithMetadata { + sql: "insert into person_with_uuid_extension (id, first_name, last_name) values ($1, $2, $3)", + expected_types: expected_types.clone(), + param_values: param_values.clone() + }; + + assert_snapshot!( + test.run(), + @r#" + ** Initial Plan: + Dml: op=[Insert Into] table=[person_with_uuid_extension] + Projection: column1 AS id, column2 AS first_name, column3 AS last_name + Values: ($1, $2, $3) + ** Final Plan: + Dml: op=[Insert Into] table=[person_with_uuid_extension] + Projection: column1 AS id, column2 AS first_name, column3 AS last_name + Values: (FixedSizeBinary(16, 
"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16") FieldMetadata { inner: {"ARROW:extension:name": "arrow.uuid"} } AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3) + "# + ); + + // Check a prepared insert + let test = ParameterTestWithMetadata { + sql: "PREPARE my_plan AS insert into person_with_uuid_extension (id, first_name, last_name) values ($1, $2, $3)", + expected_types, + param_values + }; + + assert_snapshot!( + test.run(), + @r#" + ** Initial Plan: + Prepare: "my_plan" [FixedSizeBinary(16)<{"ARROW:extension:name": "arrow.uuid"}>, Utf8, Utf8] + Dml: op=[Insert Into] table=[person_with_uuid_extension] + Projection: column1 AS id, column2 AS first_name, column3 AS last_name + Values: ($1, $2, $3) + ** Final Plan: + Dml: op=[Insert Into] table=[person_with_uuid_extension] + Projection: column1 AS id, column2 AS first_name, column3 AS last_name + Values: (FixedSizeBinary(16, "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16") FieldMetadata { inner: {"ARROW:extension:name": "arrow.uuid"} } AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3) + "# + ); +} + #[test] fn test_prepare_statement_to_plan_data_type() { let sql = "PREPARE my_plan(DOUBLE) AS SELECT id, age FROM person WHERE age = $1"; diff --git a/datafusion/sql/tests/common/mod.rs b/datafusion/sql/tests/common/mod.rs index ee1b761970de..5d9fd9f2c374 100644 --- a/datafusion/sql/tests/common/mod.rs +++ b/datafusion/sql/tests/common/mod.rs @@ -151,6 +151,14 @@ impl ContextProvider for MockContextProvider { ), Field::new("😀", DataType::Int32, false), ])), + "person_with_uuid_extension" => Ok(Schema::new(vec![ + Field::new("id", DataType::FixedSizeBinary(16), false).with_metadata( + [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())] + .into(), + ), + Field::new("first_name", DataType::Utf8, false), + Field::new("last_name", DataType::Utf8, false), + ])), "orders" => Ok(Schema::new(vec![ Field::new("order_id", DataType::UInt32, false), Field::new("customer_id", DataType::UInt32, false), diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index f66af28f436e..96d9f23522f1 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -669,10 +669,10 @@ fn plan_insert() { assert_snapshot!( plan, @r#" - Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 - Values: (CAST(Int64(1) AS UInt32), Utf8("Alan"), Utf8("Turing")) - "# + Dml: op=[Insert Into] table=[person] + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Values: (CAST(Int64(1) AS UInt32), Utf8("Alan"), Utf8("Turing")) + "# ); } @@ -875,11 +875,11 @@ fn test_timestamp_filter() { let plan = logical_plan(sql).unwrap(); assert_snapshot!( plan, - @r#" - Projection: person.state - Filter: person.birth_date < CAST(CAST(Int64(158412331400600000) AS Timestamp(Second, None)) AS Timestamp(Nanosecond, None)) - TableScan: person - "# + @r" + Projection: person.state + Filter: person.birth_date < CAST(CAST(Int64(158412331400600000) AS Timestamp(s)) AS Timestamp(ns)) + TableScan: person + " ); } @@ -1586,11 +1586,11 @@ fn select_from_typed_string_values() { assert_snapshot!( plan, @r#" - Projection: t.col1, t.col2 - 
SubqueryAlias: t - Projection: column1 AS col1, column2 AS col2 - Values: (CAST(Utf8("2021-06-10 17:01:00Z") AS Timestamp(Nanosecond, None)), CAST(Utf8("2004-04-09") AS Date32)) - "# + Projection: t.col1, t.col2 + SubqueryAlias: t + Projection: column1 AS col1, column2 AS col2 + Values: (CAST(Utf8("2021-06-10 17:01:00Z") AS Timestamp(ns)), CAST(Utf8("2004-04-09") AS Date32)) + "# ); } @@ -3151,7 +3151,7 @@ fn select_typed_time_string() { assert_snapshot!( plan, @r#" - Projection: CAST(Utf8("08:09:10.123") AS Time64(Nanosecond)) AS time + Projection: CAST(Utf8("08:09:10.123") AS Time64(ns)) AS time EmptyRelation: rows=1 "# ); @@ -4686,7 +4686,7 @@ fn test_custom_type_plan() -> Result<()> { assert_snapshot!( plan, @r#" - Projection: CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None)) + Projection: CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns)) EmptyRelation: rows=1 "# ); @@ -4696,7 +4696,7 @@ fn test_custom_type_plan() -> Result<()> { assert_snapshot!( plan, @r#" - Projection: CAST(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None)) AS Timestamp(Nanosecond, None)) + Projection: CAST(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns)) AS Timestamp(ns)) EmptyRelation: rows=1 "# ); @@ -4708,7 +4708,7 @@ fn test_custom_type_plan() -> Result<()> { assert_snapshot!( plan, @r#" - Projection: make_array(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None)), CAST(Utf8("2001-01-02 18:00:00") AS Timestamp(Nanosecond, None))) + Projection: make_array(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns)), CAST(Utf8("2001-01-02 18:00:00") AS Timestamp(ns))) EmptyRelation: rows=1 "# ); diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index d02d5f9cb5e4..8ab3932e8433 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -42,7 +42,7 @@ async-trait = { workspace = true } bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } -clap = { version = "4.5.47", features = ["derive", "env"] } +clap = { version = "4.5.50", features = ["derive", "env"] } datafusion = { workspace = true, default-features = true, features = ["avro", "parquet_encryption"] } datafusion-spark = { workspace = true, default-features = true } datafusion-substrait = { workspace = true, default-features = true } diff --git a/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs b/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs index 375f06d34b44..4d310711687f 100644 --- a/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs +++ b/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs @@ -76,8 +76,8 @@ impl Postgres { /// /// See https://docs.rs/tokio-postgres/latest/tokio_postgres/config/struct.Config.html#url for format pub async fn connect(relative_path: PathBuf, pb: ProgressBar) -> Result { - let uri = - std::env::var("PG_URI").map_or(PG_URI.to_string(), std::convert::identity); + let uri = std::env::var("PG_URI") + .map_or_else(|_| PG_URI.to_string(), std::convert::identity); info!("Using postgres connection string: {uri}"); diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 43899642a93a..144e3b757adf 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -710,13 +710,13 @@ select query TTT select arrow_typeof(column1), arrow_typeof(column2), arrow_typeof(column3) from arrays; ---- -List(Field { name: "item", data_type: 
List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable 
Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) # arrays table query ??? @@ -1182,7 +1182,7 @@ select make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)')) query T select arrow_typeof(make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)'))); ---- -List(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable LargeList(nullable Int64)) query ??? @@ -3292,7 +3292,7 @@ select array_concat([arrow_cast('1', 'Utf8'), arrow_cast('2', 'Utf8')], [arrow_cast('3', 'Utf8View')]), arrow_typeof(array_concat([arrow_cast('1', 'Utf8'), arrow_cast('2', 'Utf8')], [arrow_cast('3', 'Utf8View')])); ---- -[1, 2, 3] List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2, 3] List(nullable Utf8View) # array_concat error query error DataFusion error: Error during planning: Execution error: Function 'array_concat' user-defined coercion failed with "Error during planning: array_concat does not support type Int64" @@ -4585,7 +4585,7 @@ NULL [baz] baz query T SELECT arrow_typeof(make_array(arrow_cast('a', 'Utf8View'), 'b', 'c', 'd')); ---- -List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Utf8View) # expect a,b,c,d. make_array forces all types to be of a common type (see above) query T @@ -6054,7 +6054,7 @@ NULL NULL # array_has([], 1) -> 'false' (empty array should return false) # array_has(null, 1) -> 'null' (null array should return null) query ?T -SELECT column1, COALESCE(CAST(array_has(column1, column2) AS VARCHAR), 'null') +SELECT column1, COALESCE(CAST(array_has(column1, column2) AS VARCHAR), 'null') from array_has_table_empty; ---- [1, 3, 5] true @@ -6315,7 +6315,7 @@ true false false true false true false false NULL NULL false false false false NULL false -false false false NULL +false false false NULL query BBBB select array_has_all(make_array(1,2,3), []), @@ -7131,7 +7131,7 @@ select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, [2021-01-01T00:00:00-05:00, 2021-01-01T01:29:54.500-05:00, 2021-01-01T02:59:49-05:00, 2021-01-01T04:29:43.500-05:00, 2021-01-01T05:59:38-05:00] ## mixing types for timestamps is not supported -query error DataFusion error: Internal error: Unexpected argument type for GENERATE_SERIES : Date32 +query error DataFusion error: Internal error: Unexpected argument type for generate_series : Date32 select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, Some("-05:00"))'), DATE '2021-01-02', INTERVAL '1' HOUR); @@ -7239,7 +7239,7 @@ query error DataFusion error: Execution error: step can't be 0 for function gene select generate_series(1, 1, 0); # Test generate_series with zero step -query error DataFusion error: Execution error: Interval argument to GENERATE_SERIES must not be 0 +query error DataFusion error: Execution error: Interval argument to generate_series must not be 0 select generate_series(TIMESTAMP '2000-01-02', TIMESTAMP '2000-01-01', INTERVAL '0' MINUTE); # Test generate_series with big steps @@ -7653,8 +7653,8 @@ CREATE EXTERNAL TABLE fixed_size_list_array STORED AS PARQUET LOCATION '../core/ query T select arrow_typeof(f0) from fixed_size_list_array; ---- -FixedSizeList(Field { name: "item", data_type: Int64, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2) -FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2) +FixedSizeList(2 x nullable Int64) +FixedSizeList(2 x nullable Int64) query ? select * from fixed_size_list_array; @@ -7683,8 +7683,8 @@ select make_array(arrow_cast(f0, 'List(Int64)')) from fixed_size_list_array query T select arrow_typeof(make_array(arrow_cast(f0, 'List(Int64)'))) from fixed_size_list_array ---- -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable List(nullable Int64)) +List(nullable List(nullable Int64)) query ? select make_array(f0) from fixed_size_list_array @@ -7695,8 +7695,8 @@ select make_array(f0) from fixed_size_list_array query T select arrow_typeof(make_array(f0)) from fixed_size_list_array ---- -List(Field { name: "item", data_type: FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable FixedSizeList(2 x nullable Int64)) +List(nullable FixedSizeList(2 x nullable Int64)) query ? select array_concat(column1, [7]) from arrays_values_v2; @@ -8209,7 +8209,7 @@ select array_reverse(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), array [3, 2, 1] [1] query ???? 
-select array_reverse(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), +select array_reverse(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), array_reverse(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)')), array_reverse(arrow_cast(make_array(1, NULL, 3), 'FixedSizeList(3, Int64)')), array_reverse(arrow_cast(make_array(NULL, NULL, NULL), 'FixedSizeList(3, Int64)')); @@ -8275,19 +8275,19 @@ select * from test_create_array_table; query T select arrow_typeof(a) from test_create_array_table; ---- -List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Int32) query T select arrow_typeof(c) from test_create_array_table; ---- -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable List(nullable Int32)) # Test casting to array types # issue: https://github.com/apache/datafusion/issues/9440 query ??T select [1,2,3]::int[], [['1']]::int[][], arrow_typeof([]::text[]); ---- -[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2, 3] [[1]] List(nullable Utf8View) # test empty arrays return length # issue: https://github.com/apache/datafusion/pull/12459 @@ -8307,8 +8307,8 @@ create table fixed_size_col_table (a int[3]) as values ([1,2,3]), ([4,5,6]); query T select arrow_typeof(a) from fixed_size_col_table; ---- -FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) -FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) +FixedSizeList(3 x nullable Int32) +FixedSizeList(3 x nullable Int32) query ? rowsort SELECT DISTINCT a FROM fixed_size_col_table diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt b/datafusion/sqllogictest/test_files/arrow_typeof.slt index 654218531f1d..ac32ef821bc4 100644 --- a/datafusion/sqllogictest/test_files/arrow_typeof.slt +++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt @@ -61,13 +61,13 @@ Decimal128(38, 10) query T SELECT arrow_typeof(now()::timestamp) ---- -Timestamp(Nanosecond, None) +Timestamp(ns) # arrow_typeof_timestamp_utc query T SELECT arrow_typeof(now()) ---- -Timestamp(Nanosecond, Some("+00:00")) +Timestamp(ns, "+00:00") # arrow_typeof_timestamp_date32( query T @@ -98,7 +98,7 @@ SELECT arrow_cast('1') query error DataFusion error: Execution error: arrow_cast requires its second argument to be a non\-empty constant string SELECT arrow_cast('1', 43) -query error Error unrecognized word: unknown +query error DataFusion error: Execution error: Unsupported type 'unknown'\. Must be a supported arrow type name such as 'Int32' or 'Timestamp\(ns\)'\. 
Error unknown token: unknown SELECT arrow_cast('1', 'unknown') # Round Trip tests: @@ -130,7 +130,7 @@ SELECT arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, Some("+08:00"))')) as col_tstz_ns, arrow_typeof(arrow_cast('foo', 'Dictionary(Int32, Utf8)')) as col_dict ---- -Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float16 Float32 Float64 Utf8 LargeUtf8 Utf8View Binary LargeBinary Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None) Timestamp(Second, Some("+08:00")) Timestamp(Millisecond, Some("+08:00")) Timestamp(Microsecond, Some("+08:00")) Timestamp(Nanosecond, Some("+08:00")) Dictionary(Int32, Utf8) +Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float16 Float32 Float64 Utf8 LargeUtf8 Utf8View Binary LargeBinary Timestamp(s) Timestamp(ms) Timestamp(µs) Timestamp(ns) Timestamp(s, "+08:00") Timestamp(ms, "+08:00") Timestamp(µs, "+08:00") Timestamp(ns, "+08:00") Dictionary(Int32, Utf8) @@ -255,7 +255,7 @@ SELECT arrow_typeof(col_ts_ns) FROM foo; ---- -Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None) +Timestamp(s) Timestamp(ms) Timestamp(µs) Timestamp(ns) statement ok @@ -316,7 +316,7 @@ select arrow_cast(interval '30 minutes', 'Duration(Second)'); ---- 0 days 0 hours 30 mins 0 secs -query error DataFusion error: This feature is not implemented: Unsupported CAST from Utf8 to Duration\(Second\) +query error DataFusion error: This feature is not implemented: Unsupported CAST from Utf8 to Duration\(s\) select arrow_cast('30 minutes', 'Duration(Second)'); @@ -357,12 +357,12 @@ select arrow_cast(make_array(1, 2, 3), 'List(Int64)'); query T select arrow_typeof(arrow_cast(make_array(1, 2, 3), 'List(Int64)')); ---- -List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Int64) query T select arrow_typeof(arrow_cast(arrow_cast(make_array([1, 2, 3]), 'LargeList(LargeList(Int64))'), 'List(List(Int64))')); ---- -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable List(nullable Int64)) ## LargeList @@ -380,12 +380,12 @@ select arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'); query T select arrow_typeof(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')); ---- -LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +LargeList(nullable Int64) query T select arrow_typeof(arrow_cast(make_array([1, 2, 3]), 'LargeList(LargeList(Int64))')); ---- -LargeList(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +LargeList(nullable LargeList(nullable Int64)) ## FixedSizeList @@ -417,7 +417,7 @@ select arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'); query T select arrow_typeof(arrow_cast(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 'FixedSizeList(3, Int64)')); ---- -FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) +FixedSizeList(3 x nullable Int64) query ? 
select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)'); diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index 2f9173d2dcbd..4eaa87b0b516 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -594,4 +594,26 @@ query I SELECT CASE WHEN a = 'a' THEN 0 WHEN a = 'b' THEN 1 ELSE 2 END FROM (VALUES (NULL), ('z')) t(a) ---- 2 -2 \ No newline at end of file +2 + +# The `WHEN 1/0` is not effectively reachable in this query and should never be executed +query T +SELECT CASE a WHEN 1 THEN 'a' WHEN 2 THEN 'b' WHEN 1 / 0 THEN 'c' ELSE 'd' END FROM (VALUES (1), (2)) t(a) +---- +a +b + +# The `WHEN 1/0` is not effectively reachable in this query and should never be executed +query T +SELECT CASE WHEN a = 1 THEN 'a' WHEN a = 2 THEN 'b' WHEN a = 1 / 0 THEN 'c' ELSE 'd' END FROM (VALUES (1), (2)) t(a) +---- +a +b + +# The `WHEN 1/0` is not effectively reachable in this query and should never be executed +query T +SELECT CASE WHEN a = 0 THEN 'a' WHEN 1 / a = 1 THEN 'b' ELSE 'c' END FROM (VALUES (0), (1), (2)) t(a) +---- +a +b +c diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index 9740bade5e27..e34a601851d7 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -199,14 +199,14 @@ select coalesce(array[1, 2], array[3, 4]), arrow_typeof(coalesce(array[1, 2], array[3, 4])); ---- -[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2] List(nullable Int64) query ?T select coalesce(null, array[3, 4]), arrow_typeof(coalesce(array[1, 2], array[3, 4])); ---- -[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[3, 4] List(nullable Int64) # coalesce with array query ?T @@ -214,7 +214,7 @@ select coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')])); ---- -[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2] List(nullable Int64) # test dict(int32, utf8) statement ok diff --git a/datafusion/sqllogictest/test_files/count_star_rule.slt b/datafusion/sqllogictest/test_files/count_star_rule.slt index 826742267290..b78c021a565c 100644 --- a/datafusion/sqllogictest/test_files/count_star_rule.slt +++ b/datafusion/sqllogictest/test_files/count_star_rule.slt @@ -88,7 +88,7 @@ logical_plan 03)----TableScan: t1 projection=[a] physical_plan 01)ProjectionExec: expr=[a@0 as a, count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as count_a] -02)--WindowAggExec: wdw=[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +02)--WindowAggExec: wdw=[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64 }), 
frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/current_time_timezone.slt b/datafusion/sqllogictest/test_files/current_time_timezone.slt index a9e27bd4045f..c80c4b51d5ac 100644 --- a/datafusion/sqllogictest/test_files/current_time_timezone.slt +++ b/datafusion/sqllogictest/test_files/current_time_timezone.slt @@ -29,7 +29,7 @@ true query T SELECT arrow_typeof(current_time()); ---- -Time64(Nanosecond) +Time64(ns) # Test 3: Set timezone to +08:00 and verify current_time is still stable statement ok @@ -44,7 +44,7 @@ true query T SELECT arrow_typeof(current_time()); ---- -Time64(Nanosecond) +Time64(ns) # Test 5: Test with negative offset timezone statement ok diff --git a/datafusion/sqllogictest/test_files/dates.slt b/datafusion/sqllogictest/test_files/dates.slt index 2e91a0363db0..a309be114809 100644 --- a/datafusion/sqllogictest/test_files/dates.slt +++ b/datafusion/sqllogictest/test_files/dates.slt @@ -85,9 +85,14 @@ g h ## Plan error when compare Utf8 and timestamp in where clause -statement error DataFusion error: type_coercion\ncaused by\nError during planning: Cannot coerce arithmetic expression Timestamp\(Nanosecond, Some\("\+00:00"\)\) \+ Utf8 to valid types +statement error select i_item_desc from test where d3_date > now() + '5 days'; +---- +DataFusion error: type_coercion +caused by +Error during planning: Cannot coerce arithmetic expression Timestamp(ns, "+00:00") + Utf8 to valid types + # DATE minus DATE # https://github.com/apache/arrow-rs/issues/4383 diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index bc6cbfab0cae..64c78284594f 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -867,7 +867,7 @@ query TTTTTT show columns FROM table_with_pk; ---- datafusion public table_with_pk sn Int32 NO -datafusion public table_with_pk ts Timestamp(Nanosecond, Some("+00:00")) NO +datafusion public table_with_pk ts Timestamp(ns, "+00:00") NO datafusion public table_with_pk currency Utf8View NO datafusion public table_with_pk amount Float32 YES diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index de5208b5483a..88347965c67a 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -83,7 +83,7 @@ float_col Float32 YES double_col Float64 YES date_string_col Utf8View YES string_col Utf8View YES -timestamp_col Timestamp(Nanosecond, None) YES +timestamp_col Timestamp(ns) YES year Int32 YES month Int32 YES @@ -116,3 +116,29 @@ col1 Int32 YES # Test error cases statement error DESC nonexistent_table; + +########## +# Describe statement +########## + +# Test describing the schema of a simple statement +query TTT +DESCRIBE SELECT 1; +---- +Int64(1) Int64 NO + +# Insert some data into the existing test table... +statement ok +INSERT INTO test_desc_table (id, name) VALUES (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'), (4, 'Alice'); + +# ... 
and describe the schema of a more complex query +query TTT +DESCRIBE SELECT name, COUNT(*) AS name_count FROM test_desc_table + GROUP BY name HAVING COUNT(*) > 1 ORDER BY name_count DESC; +---- +name Utf8View YES +name_count Int64 NO + +# Describing a statement that's not a query is not supported +statement error Describing statements other than SELECT not supported +DESCRIBE CREATE TABLE test_desc_table (id INT, name VARCHAR); diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt index 9e8a39494095..fd9a7fb9ce44 100644 --- a/datafusion/sqllogictest/test_files/dictionary.slt +++ b/datafusion/sqllogictest/test_files/dictionary.slt @@ -85,7 +85,7 @@ f1 Float64 YES f2 Utf8 YES f3 Utf8 YES f4 Float64 YES -time Timestamp(Nanosecond, None) YES +time Timestamp(ns) YES # in list with dictionary input query BBB @@ -157,7 +157,7 @@ DESCRIBE m2; type Dictionary(Int32, Utf8) YES tag_id Dictionary(Int32, Utf8) YES f5 Float64 YES -time Timestamp(Nanosecond, None) YES +time Timestamp(ns) YES query I select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00'; diff --git a/datafusion/sqllogictest/test_files/encoding.slt b/datafusion/sqllogictest/test_files/encoding.slt index 960e81f4d14c..300294f6e115 100644 --- a/datafusion/sqllogictest/test_files/encoding.slt +++ b/datafusion/sqllogictest/test_files/encoding.slt @@ -23,7 +23,7 @@ CREATE TABLE test( hex_field TEXT ) as VALUES (0, 'abc', encode('abc', 'base64'), encode('abc', 'hex')), - (1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')), + (1, 'qweqw', encode('qweqw', 'base64') || '=', encode('qweqw', 'hex')), (2, NULL, NULL, NULL), (3, X'8f50d3f60eae370ddbf85c86219c55108a350165', encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'hex')) ; @@ -52,7 +52,7 @@ query T SELECT encode(bin_field, 'hex') FROM test ORDER BY num; ---- 616263 -717765717765 +7177657177 NULL 8f50d3f60eae370ddbf85c86219c55108a350165 @@ -60,7 +60,7 @@ query T SELECT arrow_cast(decode(base64_field, 'base64'), 'Utf8') FROM test ORDER BY num; ---- abc -qweqwe +qweqw NULL 8f50d3f60eae370ddbf85c86219c55108a350165 @@ -68,7 +68,7 @@ query T SELECT arrow_cast(decode(hex_field, 'hex'), 'Utf8') FROM test ORDER BY num; ---- abc -qweqwe +qweqw NULL 8f50d3f60eae370ddbf85c86219c55108a350165 @@ -110,7 +110,6 @@ SELECT column1_utf8view, encode(column1_utf8view, 'base64') AS column1_base64, encode(column1_utf8view, 'hex') AS column1_hex, - column2_utf8view, encode(column2_utf8view, 'base64') AS column2_base64, encode(column2_utf8view, 'hex') AS column2_hex diff --git a/datafusion/sqllogictest/test_files/expr/date_part.slt b/datafusion/sqllogictest/test_files/expr/date_part.slt index 64f16f72421a..bee8602d80bd 100644 --- a/datafusion/sqllogictest/test_files/expr/date_part.slt +++ b/datafusion/sqllogictest/test_files/expr/date_part.slt @@ -1005,10 +1005,10 @@ SELECT extract(day from arrow_cast(864000, 'Duration(Second)')) ---- 10 -query error DataFusion error: Arrow error: Compute error: Month does not support: Duration\(Second\) +query error DataFusion error: Arrow error: Compute error: Month does not support: Duration\(s\) SELECT extract(month from arrow_cast(864000, 'Duration(Second)')) -query error DataFusion error: Arrow error: Compute error: Year does not support: Duration\(Second\) +query error DataFusion error: Arrow error: Compute error: Year does not support: Duration\(s\) SELECT extract(year from arrow_cast(864000, 
'Duration(Second)')) query I diff --git a/datafusion/sqllogictest/test_files/float16.slt b/datafusion/sqllogictest/test_files/float16.slt index 5e59c730f078..699eb81844a4 100644 --- a/datafusion/sqllogictest/test_files/float16.slt +++ b/datafusion/sqllogictest/test_files/float16.slt @@ -51,13 +51,14 @@ NULL NULL NULL NULL NULL NULL NaN NaN NaN NaN NaN NaN # Try coercing with literal NULL -query error +query R select column1 + NULL from float16s; ---- -DataFusion error: type_coercion -caused by -Error during planning: Cannot automatically convert Null to Float16 - +NULL +NULL +NULL +NULL +NULL # Test coercions with equality query BBBBBB @@ -78,11 +79,14 @@ false false false false false false # Try coercing with literal NULL -query error +query B select column1 = NULL from float16s; ---- -DataFusion error: Error during planning: Cannot infer common argument type for comparison operation Float16 = Null - +NULL +NULL +NULL +NULL +NULL # Cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index b72f73d44698..08636b482e38 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -3646,7 +3646,7 @@ physical_plan 07)------------AggregateExec: mode=Partial, gby=[sn@2 as sn, zip_code@0 as zip_code, country@1 as country, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum_amount@6 as sum_amount], aggr=[] 08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 09)----------------ProjectionExec: expr=[zip_code@0 as zip_code, country@1 as country, sn@2 as sn, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@6 as sum_amount] -10)------------------BoundedWindowAggExec: wdw=[sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +10)------------------BoundedWindowAggExec: wdw=[sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 11)--------------------DataSourceExec: partitions=1, partition_sizes=[2] @@ -3943,7 +3943,7 @@ physical_plan 04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true 06)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n] -07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER 
BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 08)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true # reset partition number to 8. @@ -4065,7 +4065,7 @@ logical_plan 05)--------TableScan: multiple_ordered_table_with_pk projection=[b, c, d] physical_plan 01)ProjectionExec: expr=[c@0 as c, sum1@2 as sum1, sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as sumb] -02)--WindowAggExec: wdw=[sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +02)--WindowAggExec: wdw=[sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1] 04)------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0]) 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index b15ec026372d..c67405715149 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -246,7 +246,7 @@ datafusion.execution.parquet.max_predicate_cache_size NULL datafusion.execution.parquet.max_row_group_size 1048576 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 -datafusion.execution.parquet.metadata_size_hint NULL +datafusion.execution.parquet.metadata_size_hint 524288 datafusion.execution.parquet.pruning true datafusion.execution.parquet.pushdown_filters false datafusion.execution.parquet.reorder_filters false @@ -366,7 +366,7 @@ datafusion.execution.parquet.max_predicate_cache_size NULL (reading) The maximum datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. 
You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. -datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer +datafusion.execution.parquet.metadata_size_hint 524288 (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. 
If false, the filters are applied in the same order as written in the query @@ -705,6 +705,54 @@ SHOW CREATE TABLE abc; ---- datafusion public abc CREATE EXTERNAL TABLE abc STORED AS CSV LOCATION ../../testing/data/csv/aggregate_test_100.csv +# show_external_create_table_with_order +statement ok +CREATE EXTERNAL TABLE abc_ordered +STORED AS CSV +WITH ORDER (c1) +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); + +query TTTT +SHOW CREATE TABLE abc_ordered; +---- +datafusion public abc_ordered CREATE EXTERNAL TABLE abc_ordered STORED AS CSV WITH ORDER (c1) LOCATION ../../testing/data/csv/aggregate_test_100.csv + +statement ok +DROP TABLE abc_ordered; + +# show_external_create_table_with_multiple_order_columns +statement ok +CREATE EXTERNAL TABLE abc_multi_order +STORED AS CSV +WITH ORDER (c1, c2 DESC) +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); + +query TTTT +SHOW CREATE TABLE abc_multi_order; +---- +datafusion public abc_multi_order CREATE EXTERNAL TABLE abc_multi_order STORED AS CSV WITH ORDER (c1, c2 DESC) LOCATION ../../testing/data/csv/aggregate_test_100.csv + +statement ok +DROP TABLE abc_multi_order; + +# show_external_create_table_with_order_nulls +statement ok +CREATE EXTERNAL TABLE abc_order_nulls +STORED AS CSV +WITH ORDER (c1 NULLS LAST, c2 DESC NULLS FIRST) +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); + +query TTTT +SHOW CREATE TABLE abc_order_nulls; +---- +datafusion public abc_order_nulls CREATE EXTERNAL TABLE abc_order_nulls STORED AS CSV WITH ORDER (c1 NULLS LAST, c2 DESC NULLS FIRST) LOCATION ../../testing/data/csv/aggregate_test_100.csv + +statement ok +DROP TABLE abc_order_nulls; + # string_agg has different arg_types but same return type. Test avoiding duplicate entries for the same function. 
query TTT select routine_name, data_type, function_type from information_schema.routines where routine_name = 'string_agg'; diff --git a/datafusion/sqllogictest/test_files/information_schema_columns.slt b/datafusion/sqllogictest/test_files/information_schema_columns.slt index d348a764fa85..c733b3baa7a4 100644 --- a/datafusion/sqllogictest/test_files/information_schema_columns.slt +++ b/datafusion/sqllogictest/test_files/information_schema_columns.slt @@ -42,7 +42,7 @@ my_catalog my_schema table_with_many_types float64_col 1 NULL YES Float64 NULL N my_catalog my_schema table_with_many_types int32_col 0 NULL NO Int32 NULL NULL 32 2 NULL NULL NULL my_catalog my_schema table_with_many_types large_binary_col 5 NULL NO LargeBinary NULL 9223372036854775807 NULL NULL NULL NULL NULL my_catalog my_schema table_with_many_types large_utf8_col 3 NULL NO LargeUtf8 NULL 9223372036854775807 NULL NULL NULL NULL NULL -my_catalog my_schema table_with_many_types timestamp_nanos 6 NULL NO Timestamp(Nanosecond, None) NULL NULL NULL NULL NULL NULL NULL +my_catalog my_schema table_with_many_types timestamp_nanos 6 NULL NO Timestamp(ns) NULL NULL NULL NULL NULL NULL NULL my_catalog my_schema table_with_many_types utf8_col 2 NULL YES Utf8 NULL 2147483647 NULL NULL NULL NULL NULL # Cleanup diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 9a3c959884aa..b8b2a7c37276 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -68,7 +68,7 @@ physical_plan 02)--ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@0 as field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@1 as field2] 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST] 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, c1@0 as c1] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 
+05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 @@ -128,7 +128,7 @@ physical_plan 01)DataSinkExec: sink=MemoryTable (partitions=1) 02)--CoalescePartitionsExec 03)----ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as field1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as field2] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 06)----------CoalesceBatchesExec: target_batch_size=8192 07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 @@ -179,7 +179,7 @@ physical_plan 02)--ProjectionExec: expr=[a1@0 as a1, a2@1 as 
a2] 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST] 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as a1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as a2, c1@0 as c1] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index 075256ae4b92..dc8ef59bbedc 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -422,7 +422,7 @@ physical_plan 02)--ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@0 as field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@1 as field2] 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST] 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY 
[aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, c1@0 as c1] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 @@ -483,7 +483,7 @@ physical_plan 01)DataSinkExec: sink=ParquetSink(file_groups=[]) 02)--CoalescePartitionsExec 03)----ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as field1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as field2] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 06)----------CoalesceBatchesExec: target_batch_size=8192 07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 diff --git a/datafusion/sqllogictest/test_files/interval.slt b/datafusion/sqllogictest/test_files/interval.slt index 1ef3048ddc66..8c5a4382ed2c 100644 --- a/datafusion/sqllogictest/test_files/interval.slt +++ b/datafusion/sqllogictest/test_files/interval.slt @@ -444,7 +444,7 @@ select '1 month'::interval + '1980-01-01T12:00:00'::timestamp; query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types select '1 month'::interval - '1980-01-01'::date; -query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types +query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types select '1 month'::interval - '1980-01-01T12:00:00'::timestamp; # interval (array) + date / timestamp (array) @@ -466,7 +466,7 @@ select i + ts from t; query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types select i - d from t; -query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types +query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types select i - ts from t; # interval unit abreiviation and plurals @@ -530,7 +530,7 @@ SELECT interval '5 day' hour query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types select '1 month'::interval - d from t; -query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types +query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types select '1 month'::interval - ts from t; # interval + date diff --git a/datafusion/sqllogictest/test_files/join_lists.slt b/datafusion/sqllogictest/test_files/join_lists.slt index c07bd85551f3..0a48a4f9203e 100644 --- a/datafusion/sqllogictest/test_files/join_lists.slt +++ b/datafusion/sqllogictest/test_files/join_lists.slt @@ -60,4 +60,3 @@ DROP TABLE categories_raw; statement 
ok DROP TABLE places; - diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 0174321dd831..4bdf2e5da963 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3199,7 +3199,7 @@ physical_plan 04)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, rn1@5 ASC NULLS LAST 05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 06)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -07)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true 09)----CoalesceBatchesExec: target_batch_size=2 10)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST @@ -3237,7 +3237,7 @@ physical_plan 08)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, rn1@5 ASC NULLS LAST 09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 10)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -11)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +11)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 12)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true statement ok @@ -3276,14 +3276,14 @@ physical_plan 06)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2 07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 08)--------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING@5 as rn1] -09)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +09)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true 11)------SortExec: expr=[a@1 ASC], preserve_partitioning=[true] 12)--------CoalesceBatchesExec: target_batch_size=2 13)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2 14)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 15)--------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -16)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +16)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 17)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true statement ok @@ -3318,7 +3318,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@1, a@1)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true 04)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 06)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, 
projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true # hash join should propagate ordering equivalence of the right side for RIGHT ANTI join. @@ -3345,7 +3345,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(a@0, a@1)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a], output_ordering=[a@0 ASC], file_type=csv, has_header=true 04)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 06)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true # Test ordering preservation for RIGHT join @@ -3441,7 +3441,7 @@ physical_plan 04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true 06)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n] -07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 08)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true # run query above in multiple partitions @@ -4036,12 +4036,12 @@ logical_plan 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int)))|depth=1] structs[] 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t1.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int))) 11)----------------EmptyRelation: rows=1 -physical_plan_error This feature is not implemented: Physical plan does not 
support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "t1" }), name: "t1_int" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true }, Column { relation: Some(Bare { table: "t1" }), name: "t1_int" }) # Test CROSS JOIN LATERAL syntax (execution) # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true \}, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int" \}\) select t1_id, t1_name, i from join_t1 t1 cross join lateral (select * from unnest(generate_series(1, t1_int))) as series(i); @@ -4061,12 +4061,12 @@ logical_plan 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int)))|depth=1] structs[] 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t2.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int))) 11)----------------EmptyRelation: rows=1 -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "t2" }), name: "t1_int" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true }, Column { relation: Some(Bare { table: "t2" }), name: "t1_int" }) # Test INNER JOIN LATERAL syntax (execution) # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true \}, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int" \}\) select t1_id, t1_name, i from join_t1 t2 inner join lateral (select * from unnest(generate_series(1, t1_int))) as series(i) on(t1_id > i); # Test RIGHT JOIN LATERAL syntax (unsupported) @@ -4671,7 +4671,7 @@ logical_plan 05)------Subquery: 06)--------Filter: outer_ref(j1.j1_id) < j2.j2_id 07)----------TableScan: j2 projection=[j2_string, j2_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) 
+physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) query TT explain SELECT * FROM j1 JOIN (j2 JOIN j3 ON(j2_id = j3_id - 2)) ON(j1_id = j2_id), LATERAL (SELECT * FROM j3 WHERE j3_string = j2_string) as j4 @@ -4687,7 +4687,7 @@ logical_plan 08)----Subquery: 09)------Filter: j3.j3_string = outer_ref(j2.j2_string) 10)--------TableScan: j3 projection=[j3_string, j3_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j2_string", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j2" }), name: "j2_string" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j2_string", data_type: Utf8View, nullable: true }, Column { relation: Some(Bare { table: "j2" }), name: "j2_string" }) query TT explain SELECT * FROM j1, LATERAL (SELECT * FROM j1, LATERAL (SELECT * FROM j2 WHERE j1_id = j2_id) as j2) as j2; @@ -4703,7 +4703,7 @@ logical_plan 08)----------Subquery: 09)------------Filter: outer_ref(j1.j1_id) = j2.j2_id 10)--------------TableScan: j2 projection=[j2_string, j2_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) query TT explain SELECT j1_string, j2_string FROM j1 LEFT JOIN LATERAL (SELECT * FROM j2 WHERE j1_id < j2_id) AS j2 ON(true); @@ -4716,7 +4716,7 @@ logical_plan 05)------Subquery: 06)--------Filter: outer_ref(j1.j1_id) < j2.j2_id 07)----------TableScan: j2 projection=[j2_string, j2_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) query TT explain SELECT * FROM j1, (j2 LEFT JOIN LATERAL (SELECT * FROM j3 WHERE j1_id + j2_id = j3_id) AS j3 ON(true)); @@ -4730,7 +4730,7 @@ logical_plan 06)------Subquery: 07)--------Filter: outer_ref(j1.j1_id) + outer_ref(j2.j2_id) = j3.j3_id 08)----------TableScan: j3 projection=[j3_string, j3_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: 
"j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) query TT explain SELECT * FROM j1, LATERAL (SELECT 1) AS j2; diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index 4f1e5ef39a00..fc21638b3f3c 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -43,8 +43,8 @@ LOCATION '../core/tests/data/parquet_map.parquet'; query TTT describe data; ---- -ints Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO -strings Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO +ints Map("entries": Struct("key": Utf8, "value": Int64), unsorted) NO +strings Map("entries": Struct("key": Utf8, "value": Utf8), unsorted) NO timestamp Utf8View NO query ??T diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt index e206aa16b8a9..1cb68b85b2bc 100644 --- a/datafusion/sqllogictest/test_files/math.slt +++ b/datafusion/sqllogictest/test_files/math.slt @@ -139,16 +139,16 @@ select abs(arrow_cast('-1.2', 'Utf8')); statement ok CREATE TABLE test_nullable_integer( - c1 TINYINT, - c2 SMALLINT, - c3 INT, - c4 BIGINT, - c5 TINYINT UNSIGNED, - c6 SMALLINT UNSIGNED, - c7 INT UNSIGNED, - c8 BIGINT UNSIGNED, + c1 TINYINT, + c2 SMALLINT, + c3 INT, + c4 BIGINT, + c5 TINYINT UNSIGNED, + c6 SMALLINT UNSIGNED, + c7 INT UNSIGNED, + c8 BIGINT UNSIGNED, dataset TEXT - ) + ) AS VALUES (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'nulls'), (0, 0, 0, 0, 0, 0, 0, 0, 'zeros'), @@ -237,7 +237,7 @@ SELECT c8%0 FROM test_nullable_integer # abs: return type query TTTTTTTT rowsort -select +select arrow_typeof(abs(c1)), arrow_typeof(abs(c2)), arrow_typeof(abs(c3)), arrow_typeof(abs(c4)), arrow_typeof(abs(c5)), arrow_typeof(abs(c6)), arrow_typeof(abs(c7)), arrow_typeof(abs(c8)) from test_nullable_integer limit 1 @@ -285,13 +285,13 @@ drop table test_nullable_integer statement ok CREATE TABLE test_non_nullable_integer( - c1 TINYINT NOT NULL, - c2 SMALLINT NOT NULL, - c3 INT NOT NULL, - c4 BIGINT NOT NULL, - c5 TINYINT UNSIGNED NOT NULL, - c6 SMALLINT UNSIGNED NOT NULL, - c7 INT UNSIGNED NOT NULL, + c1 TINYINT NOT NULL, + c2 SMALLINT NOT NULL, + c3 INT NOT NULL, + c4 BIGINT NOT NULL, + c5 TINYINT UNSIGNED NOT NULL, + c6 SMALLINT UNSIGNED NOT NULL, + c7 INT UNSIGNED NOT NULL, c8 BIGINT UNSIGNED NOT NULL ); @@ -363,7 +363,7 @@ CREATE TABLE test_nullable_float( c2 double ) AS VALUES (-1.0, -1.0), - (1.0, 1.0), + (1.0, 1.0), (NULL, NULL), (0., 0.), ('NaN'::double, 'NaN'::double); @@ -412,7 +412,7 @@ Float32 Float64 # abs: floats query RR rowsort -SELECT abs(c1), abs(c2) from test_nullable_float +SELECT abs(c1), abs(c2) from test_nullable_float ---- 0 0 1 1 @@ -420,6 +420,17 @@ SELECT abs(c1), abs(c2) from test_nullable_float NULL NULL NaN NaN +# f16 +query TR rowsort +SELECT arrow_typeof(abs(arrow_cast(c1, 'Float16'))), abs(arrow_cast(c1, 'Float16')) +FROM 
test_nullable_float +---- +Float16 0 +Float16 1 +Float16 1 +Float16 NULL +Float16 NaN + statement ok drop table test_nullable_float @@ -428,7 +439,7 @@ statement ok CREATE TABLE test_non_nullable_float( c1 float NOT NULL, c2 double NOT NULL - ); + ); query I INSERT INTO test_non_nullable_float VALUES @@ -478,27 +489,27 @@ drop table test_non_nullable_float statement ok CREATE TABLE test_nullable_decimal( c1 DECIMAL(10, 2), /* Decimal128 */ - c2 DECIMAL(38, 10), /* Decimal128 with max precision */ + c2 DECIMAL(38, 10), /* Decimal128 with max precision */ c3 DECIMAL(40, 2), /* Decimal256 */ - c4 DECIMAL(76, 10) /* Decimal256 with max precision */ - ) AS VALUES - (0, 0, 0, 0), + c4 DECIMAL(76, 10) /* Decimal256 with max precision */ + ) AS VALUES + (0, 0, 0, 0), (NULL, NULL, NULL, NULL); query I INSERT into test_nullable_decimal values ( - -99999999.99, - '-9999999999999999999999999999.9999999999', - '-99999999999999999999999999999999999999.99', + -99999999.99, + '-9999999999999999999999999999.9999999999', + '-99999999999999999999999999999999999999.99', '-999999999999999999999999999999999999999999999999999999999999999999.9999999999' - ), + ), ( - 99999999.99, - '9999999999999999999999999999.9999999999', - '99999999999999999999999999999999999999.99', + 99999999.99, + '9999999999999999999999999999.9999999999', + '99999999999999999999999999999999999999.99', '999999999999999999999999999999999999999999999999999999999999999999.9999999999' - ) + ) ---- 2 @@ -533,9 +544,9 @@ SELECT c1%0 FROM test_nullable_decimal WHERE c1 IS NOT NULL; # abs: return type query TTTT -SELECT - arrow_typeof(abs(c1)), - arrow_typeof(abs(c2)), +SELECT + arrow_typeof(abs(c1)), + arrow_typeof(abs(c2)), arrow_typeof(abs(c3)), arrow_typeof(abs(c4)) FROM test_nullable_decimal limit 1 @@ -552,11 +563,11 @@ SELECT abs(c1), abs(c2), abs(c3), abs(c4) FROM test_nullable_decimal NULL NULL NULL NULL statement ok -drop table test_nullable_decimal +drop table test_nullable_decimal statement ok -CREATE TABLE test_non_nullable_decimal(c1 DECIMAL(9,2) NOT NULL); +CREATE TABLE test_non_nullable_decimal(c1 DECIMAL(9,2) NOT NULL); query I INSERT INTO test_non_nullable_decimal VALUES(1) @@ -569,13 +580,13 @@ SELECT c1*0 FROM test_non_nullable_decimal 0 query error DataFusion error: Arrow error: Divide by zero error -SELECT c1/0 FROM test_non_nullable_decimal +SELECT c1/0 FROM test_non_nullable_decimal query error DataFusion error: Arrow error: Divide by zero error -SELECT c1%0 FROM test_non_nullable_decimal +SELECT c1%0 FROM test_non_nullable_decimal statement ok -drop table test_non_nullable_decimal +drop table test_non_nullable_decimal statement ok CREATE TABLE signed_integers( @@ -615,7 +626,7 @@ NULL NULL NULL # scalar maxes and/or negative 1 query III -select +select gcd(9223372036854775807, -9223372036854775808), -- i64::MAX, i64::MIN gcd(9223372036854775807, -1), -- i64::MAX, -1 gcd(-9223372036854775808, -1); -- i64::MIN, -1 diff --git a/datafusion/sqllogictest/test_files/named_arguments.slt b/datafusion/sqllogictest/test_files/named_arguments.slt new file mode 100644 index 000000000000..c93da7e7a8f9 --- /dev/null +++ b/datafusion/sqllogictest/test_files/named_arguments.slt @@ -0,0 +1,139 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############# +## Tests for Named Arguments (PostgreSQL-style param => value syntax) +############# + +# Test positional arguments still work (baseline) +query T +SELECT substr('hello world', 7, 5); +---- +world + +# Test named arguments in order +query T +SELECT substr(str => 'hello world', start_pos => 7, length => 5); +---- +world + +# Test named arguments out of order +query T +SELECT substr(length => 5, str => 'hello world', start_pos => 7); +---- +world + +# Test mixed positional and named arguments +query T +SELECT substr('hello world', start_pos => 7, length => 5); +---- +world + +# Test with only 2 parameters (length optional) +query T +SELECT substr(str => 'hello world', start_pos => 7); +---- +world + +# Test all parameters named with substring alias +query T +SELECT substring(str => 'hello', start_pos => 1, length => 3); +---- +hel + +# Error: positional argument after named argument +query error DataFusion error: Error during planning: Positional argument.*follows named argument +SELECT substr(str => 'hello', 1, 3); + +# Error: unknown parameter name +query error DataFusion error: Error during planning: Unknown parameter name 'invalid' +SELECT substr(invalid => 'hello', start_pos => 1, length => 3); + +# Error: duplicate parameter name +query error DataFusion error: Error during planning: Parameter 'str' specified multiple times +SELECT substr(str => 'hello', str => 'world', start_pos => 1); + +# Test case-insensitive parameter names (unquoted identifiers) +query T +SELECT substr(STR => 'hello world', START_POS => 7, LENGTH => 5); +---- +world + +# Test case-insensitive with mixed case +query T +SELECT substr(Str => 'hello world', Start_Pos => 7); +---- +world + +# Error: case-sensitive quoted parameter names don't match +query error DataFusion error: Error during planning: Unknown parameter name 'STR' +SELECT substr("STR" => 'hello world', "start_pos" => 7); + +# Error: wrong number of arguments +# This query provides only 1 argument but substr requires 2 or 3 +query error DataFusion error: Error during planning: Execution error: Function 'substr' user-defined coercion failed with "Error during planning: The substr function requires 2 or 3 arguments, but got 1." 
+SELECT substr(str => 'hello world'); + +############# +## PostgreSQL Dialect Tests (uses ExprNamed variant) +############# + +statement ok +set datafusion.sql_parser.dialect = 'PostgreSQL'; + +# Test named arguments in order +query T +SELECT substr(str => 'hello world', start_pos => 7, length => 5); +---- +world + +# Test named arguments out of order +query T +SELECT substr(length => 5, str => 'hello world', start_pos => 7); +---- +world + +# Test mixed positional and named arguments +query T +SELECT substr('hello world', start_pos => 7, length => 5); +---- +world + +# Test with only 2 parameters (length optional) +query T +SELECT substr(str => 'hello world', start_pos => 7); +---- +world + +# Reset to default dialect +statement ok +set datafusion.sql_parser.dialect = 'Generic'; + +############# +## MsSQL Dialect Tests (does NOT support => operator) +############# + +statement ok +set datafusion.sql_parser.dialect = 'MsSQL'; + +# Error: MsSQL dialect does not support => operator +query error DataFusion error: SQL error: ParserError\("Expected: \), found: => at Line: 1, Column: 19"\) +SELECT substr(str => 'hello world', start_pos => 7, length => 5); + +# Reset to default dialect +statement ok +set datafusion.sql_parser.dialect = 'Generic'; diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index e722005bf0f0..c21f3129d4ee 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -268,7 +268,7 @@ FROM ( ) t GROUP BY 1 ---- -Timestamp(Millisecond, Some("UTC")) 2014-08-27T14:00:00Z 131072 +Timestamp(ms, "UTC") 2014-08-27T14:00:00Z 131072 # Test config listing_table_ignore_subdirectory: @@ -689,7 +689,7 @@ LOCATION '../../parquet-testing/data/int96_from_spark.parquet'; query TTT describe int96_from_spark ---- -a Timestamp(Nanosecond, None) YES +a Timestamp(ns) YES # Note that the values are read as nanosecond precision query P @@ -718,7 +718,7 @@ LOCATION '../../parquet-testing/data/int96_from_spark.parquet'; query TTT describe int96_from_spark; ---- -a Timestamp(Millisecond, None) YES +a Timestamp(ms) YES # Per https://github.com/apache/parquet-testing/blob/6e851ddd768d6af741c7b15dc594874399fc3cff/data/int96_from_spark.md?plain=1#L37 # these values should be @@ -742,7 +742,7 @@ select * from int96_from_spark 9999-12-31T03:00:00 2024-12-30T23:00:00 NULL -ERROR: Cast error: Failed to convert -9357363680509551 to datetime for Timestamp(Millisecond, None) +ERROR: Cast error: Failed to convert -9357363680509551 to datetime for Timestamp(ms) # Cleanup / reset default setting statement ok @@ -862,3 +862,30 @@ select part, k, v from t order by k statement ok DROP TABLE t; + +# Regression test for files with stats on some columns and not others +# See https://github.com/apache/datafusion/pull/18276 + +query I +COPY (SELECT 1::int AS a, 2::int as b) +TO 'test_files/scratch/parquet/mixed_stats.parquet' +STORED AS PARQUET OPTIONS ( + 'STATISTICS_ENABLED::b' 'none' +); +---- +1 + +statement ok +CREATE EXTERNAL TABLE t +STORED AS PARQUET +LOCATION 'test_files/scratch/parquet/mixed_stats.parquet'; + +query I +SELECT b +FROM t +WHERE b = 2; +---- +2 + +statement ok +DROP TABLE t; diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index 6dc2c264aeb8..e4676ae5332d 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt 
@@ -474,10 +474,7 @@ EXPLAIN select * from t_pushdown where part != val logical_plan 01)Filter: t_pushdown.val != t_pushdown.part 02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 != part@1 -03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet +physical_plan DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != part@1 # If we reference only a partition column it gets evaluated during the listing phase query TT @@ -505,11 +502,7 @@ EXPLAIN select * from t_pushdown where val != 'd' AND val != 'c' AND part = 'a' logical_plan 01)Filter: t_pushdown.val != Utf8View("d") AND t_pushdown.val != Utf8View("c") AND t_pushdown.val != t_pushdown.part 02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val != Utf8View("d"), t_pushdown.val != Utf8View("c"), t_pushdown.val != t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 != part@1 -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != d AND val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != d OR d != val_max@1) AND val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c, d)] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != d AND val@0 != c AND val@0 != part@1, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != d OR d != val_max@1) AND val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c, d)] # The order of filters should not matter query TT @@ -518,10 +511,7 @@ EXPLAIN select val, part from t_pushdown where part = 'a' AND part = val; logical_plan 01)Filter: t_pushdown.val = t_pushdown.part 02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 = part@1 -03)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 = part@1 query TT select val, part from t_pushdown where part = 'a' AND part = val; @@ -534,10 +524,7 @@ EXPLAIN select val, part from t_pushdown where part = val AND part = 'a'; logical_plan 01)Filter: t_pushdown.val = t_pushdown.part 02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 = part@1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 = part@1 query TT select val, part from t_pushdown where part = val AND part = 'a'; diff --git a/datafusion/sqllogictest/test_files/pwmj.slt b/datafusion/sqllogictest/test_files/pwmj.slt index 0014b3c545f2..eafa4d0ba394 100644 --- a/datafusion/sqllogictest/test_files/pwmj.slt +++ b/datafusion/sqllogictest/test_files/pwmj.slt @@ -158,7 +158,7 @@ ORDER BY 1,2; 33 44 44 55 -query TT +query TT EXPLAIN SELECT t1.t1_id, t2.t2_id FROM join_t1 t1 diff --git a/datafusion/sqllogictest/test_files/qualify.slt b/datafusion/sqllogictest/test_files/qualify.slt index d53b56ce58de..366d65df6792 100644 --- a/datafusion/sqllogictest/test_files/qualify.slt +++ b/datafusion/sqllogictest/test_files/qualify.slt @@ -275,7 +275,7 @@ physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 > 1, projection=[id@0, name@1] -04)------WindowAggExec: wdw=[count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +04)------WindowAggExec: wdw=[count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 05)--------DataSourceExec: partitions=1, partition_sizes=[1] # plan row_number() @@ -293,7 +293,7 @@ physical_plan 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@0 > 1 04)------ProjectionExec: expr=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED 
PRECEDING AND UNBOUNDED FOLLOWING] -05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[dept@0 ASC NULLS LAST], preserve_partitioning=[false] 07)------------DataSourceExec: partitions=1, partition_sizes=[1] @@ -321,7 +321,7 @@ physical_plan 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 > Some(60000000000),14,6 04)------ProjectionExec: expr=[dept@0 as dept, avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] -05)--------WindowAggExec: wdw=[avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Decimal128(14, 6), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +05)--------WindowAggExec: wdw=[avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Decimal128(14, 6), nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 06)----------SortExec: expr=[dept@0 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([dept@0], 4), input_partitions=4 @@ -358,7 +358,7 @@ physical_plan 04)------CoalesceBatchesExec: target_batch_size=8192 05)--------FilterExec: rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 = 1, projection=[dept@0, sum(users.salary)@1] 06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -07)------------BoundedWindowAggExec: wdw=[rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 08)--------------SortPreservingMergeExec: [sum(users.salary)@1 DESC] 09)----------------SortExec: expr=[sum(users.salary)@1 DESC], preserve_partitioning=[true] 10)------------------AggregateExec: mode=FinalPartitioned, gby=[dept@0 as dept], aggr=[sum(users.salary)] diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index b0e200015dfd..faa0d69ae84b 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -309,6 +309,14 @@ select ceil(a), ceil(b), ceil(c) from small_floats; 1 0 0 1 0 1 +# ceil with scale parameter(Scale not supported) +query error DataFusion error: This feature is not implemented: CEIL with scale is not supported +select ceil(100.1234, 1) + +# ceil with datetime parameter (not supported) +query error DataFusion error: This feature is not implemented: CEIL with datetime is not supported +select ceil(100.1234 to year) + ## degrees # degrees scalar function @@ -448,6 +456,14 @@ select floor(a), floor(b), floor(c) from signed_integers; 2 -1000 123 4 NULL NULL +# floor with scale parameter(Scale not supported) +query error DataFusion error: This feature is not implemented: FLOOR with scale is not supported +select floor(a, 1) + +# floor with datetime parameter ( not supported) +query error DataFusion error: This feature is not implemented: FLOOR with datetime is not supported +select floor(a to year) + ## ln # ln scalar function diff --git a/datafusion/sqllogictest/test_files/spark/array/shuffle.slt b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt index cb3c77cac8fb..7614caef666b 100644 --- a/datafusion/sqllogictest/test_files/spark/array/shuffle.slt +++ b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt @@ -109,5 +109,3 @@ DROP TABLE test_shuffle_list_types; statement ok DROP TABLE test_shuffle_fixed_size; - - diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt index 2789efef7bf3..39dca512226b 100644 --- a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt +++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt @@ -59,3 +59,35 @@ SELECT bitmap_count(arrow_cast(a, 'FixedSizeBinary(2)')) FROM (VALUES (X'1010'), 5 16 NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int32, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int8, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int16, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int64, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index 95eeffc31903..0e3c5145d156 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -53,9 +53,9 @@ select * from struct_values; query TT select arrow_typeof(s1), arrow_typeof(s2) from struct_values; ---- -Struct(c0 Int32) Struct(a Int32, b Utf8View) -Struct(c0 Int32) Struct(a Int32, b Utf8View) 
-Struct(c0 Int32) Struct(a Int32, b Utf8View) +Struct("c0": nullable Int32) Struct("a": nullable Int32, "b": nullable Utf8View) +Struct("c0": nullable Int32) Struct("a": nullable Int32, "b": nullable Utf8View) +Struct("c0": nullable Int32) Struct("a": nullable Int32, "b": nullable Utf8View) # struct[i] @@ -229,12 +229,12 @@ select named_struct('field_a', 1, 'field_b', 2); query T select arrow_typeof(named_struct('first', 1, 'second', 2, 'third', 3)); ---- -Struct(first Int64, second Int64, third Int64) +Struct("first": nullable Int64, "second": nullable Int64, "third": nullable Int64) query T select arrow_typeof({'first': 1, 'second': 2, 'third': 3}); ---- -Struct(first Int64, second Int64, third Int64) +Struct("first": nullable Int64, "second": nullable Int64, "third": nullable Int64) # test nested struct literal query ? @@ -413,7 +413,7 @@ create table t(a struct, b struct) as valu query T select arrow_typeof([a, b]) from t; ---- -List(Field { name: "item", data_type: Struct([Field { name: "r", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c", data_type: Float32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Struct("r": nullable Utf8View, "c": nullable Float32)) query ? select [a, b] from t; @@ -464,12 +464,12 @@ select * from t; query T select arrow_typeof(c1) from t; ---- -Struct(r Utf8View, b Int32) +Struct("r": nullable Utf8View, "b": nullable Int32) query T select arrow_typeof(c2) from t; ---- -Struct(r Utf8View, b Float32) +Struct("r": nullable Utf8View, "b": nullable Float32) statement ok drop table t; @@ -486,8 +486,8 @@ select * from t; query T select arrow_typeof(column1) from t; ---- -Struct(r Utf8, c Float64) -Struct(r Utf8, c Float64) +Struct("r": nullable Utf8, "c": nullable Float64) +Struct("r": nullable Utf8, "c": nullable Float64) statement ok drop table t; @@ -519,9 +519,9 @@ select coalesce(s1) from t; query T select arrow_typeof(coalesce(s1, s2)) from t; ---- -Struct(a Float32, b Utf8View) -Struct(a Float32, b Utf8View) -Struct(a Float32, b Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) statement ok drop table t; @@ -546,9 +546,9 @@ select coalesce(s1, s2) from t; query T select arrow_typeof(coalesce(s1, s2)) from t; ---- -Struct(a Float32, b Utf8View) -Struct(a Float32, b Utf8View) -Struct(a Float32, b Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) statement ok drop table t; @@ -583,7 +583,7 @@ create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as valu query T select arrow_typeof([a, b]) from t; ---- -List(Field { name: "item", data_type: Struct([Field { name: "r", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c", data_type: Float32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Struct("r": nullable Utf8View, "c": nullable Float32)) statement ok drop table t; @@ -606,13 +606,13 @@ create table t(a struct(r varchar, c int, g float), b struct(r varchar, c float, query T select arrow_typeof(a) from t; ---- -Struct(r Utf8View, c Int32, g Float32) +Struct("r": nullable 
Utf8View, "c": nullable Int32, "g": nullable Float32) # type of each column should not coerced but perserve as it is query T select arrow_typeof(b) from t; ---- -Struct(r Utf8View, c Float32, g Int32) +Struct("r": nullable Utf8View, "c": nullable Float32, "g": nullable Int32) statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/subquery_sort.slt b/datafusion/sqllogictest/test_files/subquery_sort.slt index 1e5a3c8f526a..ea7addd8e36f 100644 --- a/datafusion/sqllogictest/test_files/subquery_sort.slt +++ b/datafusion/sqllogictest/test_files/subquery_sort.slt @@ -100,7 +100,7 @@ physical_plan 01)ProjectionExec: expr=[c1@0 as c1, r@1 as r] 02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9] -04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3, c9], file_type=csv, has_header=true @@ -126,7 +126,7 @@ physical_plan 01)ProjectionExec: expr=[c1@0 as c1, r@1 as r] 02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9] -04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/table_functions.slt b/datafusion/sqllogictest/test_files/table_functions.slt index 0159abe8d06b..1bffbc3b3a64 100644 --- a/datafusion/sqllogictest/test_files/table_functions.slt +++ 
b/datafusion/sqllogictest/test_files/table_functions.slt @@ -188,6 +188,21 @@ SELECT generate_series(1, t1.end) FROM generate_series(3, 5) as t1(end) [1, 2, 3, 4] [1, 2, 3] +# join with projection on generate_series +query I +select g1.value from generate_series(1, 3) g1 CROSS JOIN generate_series(1, 3) g2; +---- +1 +1 +1 +2 +2 +2 +3 +3 +3 + + # Test range table function query I SELECT * FROM range(6) diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 6fe9995c7b67..84dd7098a2ee 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -79,7 +79,7 @@ SET TIME ZONE = '+08' query T select arrow_typeof(now()); ---- -Timestamp(Nanosecond, Some("+08")) +Timestamp(ns, "+08") query I SELECT count(1) result FROM (SELECT now() as n) a WHERE n > '2000-01-01'::date; @@ -691,11 +691,11 @@ select ---- 08:09:10.123456789 13:14:15.123456 13:14:15.123 13:14:15 -query error Cannot cast string 'not a time' to value of Time64\(Nanosecond\) type +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'not a time' to value of Time64\(ns\) type SELECT TIME 'not a time' as time; # invalid time -query error Cannot cast string '24:01:02' to value of Time64\(Nanosecond\) type +query error DataFusion error: Arrow error: Cast error: Cannot cast string '24:01:02' to value of Time64\(ns\) type SELECT TIME '24:01:02' as time; # invalid timezone @@ -908,7 +908,7 @@ from (values query T SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_micros(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z')) ---- -Timestamp(Microsecond, None) +Timestamp(µs) query P SELECT DATE_BIN(INTERVAL '15 minute', to_timestamp_millis(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z') @@ -926,7 +926,7 @@ from (values query T SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_millis(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z')) ---- -Timestamp(Millisecond, None) +Timestamp(ms) query P SELECT DATE_BIN(INTERVAL '15 minute', to_timestamp_seconds(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z') @@ -944,7 +944,7 @@ from (values query T SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_seconds(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z')) ---- -Timestamp(Second, None) +Timestamp(s) # month interval with INTERVAL keyword in date_bin with default start time query P @@ -1540,24 +1540,24 @@ from timestamp_utc; -- have to convert to utc prior to converting to berlin query PT select ts, arrow_typeof(ts) from timestamp_utc order by ts; ---- -2024-10-27T00:00:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T00:30:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T01:30:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T02:00:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T02:30:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T03:00:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T03:30:00Z Timestamp(Nanosecond, Some("UTC")) +2024-10-27T00:00:00Z Timestamp(ns, "UTC") +2024-10-27T00:30:00Z Timestamp(ns, "UTC") +2024-10-27T01:30:00Z Timestamp(ns, "UTC") +2024-10-27T02:00:00Z Timestamp(ns, "UTC") +2024-10-27T02:30:00Z Timestamp(ns, "UTC") +2024-10-27T03:00:00Z Timestamp(ns, "UTC") +2024-10-27T03:30:00Z Timestamp(ns, "UTC") query PT select ts, arrow_typeof(ts) from timestamp_berlin order by ts; ---- -2024-10-27T02:00:00+02:00 Timestamp(Nanosecond, 
Some("Europe/Berlin")) -2024-10-27T02:30:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T02:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T03:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T03:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T04:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T04:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) +2024-10-27T02:00:00+02:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T02:30:00+02:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T02:30:00+01:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T03:00:00+01:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T03:30:00+01:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T04:00:00+01:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T04:30:00+01:00 Timestamp(ns, "Europe/Berlin") # date trunc in utc with DST query PPPP @@ -1624,24 +1624,24 @@ from timestamp_utc; -- have to convert to utc prior to converting to Sau Paulo query PT select ts, arrow_typeof(ts) from timestamp_utc order by ts; ---- -2018-11-04T01:00:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T01:30:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T02:30:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T03:00:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T03:30:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T04:00:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T04:30:00Z Timestamp(Nanosecond, Some("UTC")) +2018-11-04T01:00:00Z Timestamp(ns, "UTC") +2018-11-04T01:30:00Z Timestamp(ns, "UTC") +2018-11-04T02:30:00Z Timestamp(ns, "UTC") +2018-11-04T03:00:00Z Timestamp(ns, "UTC") +2018-11-04T03:30:00Z Timestamp(ns, "UTC") +2018-11-04T04:00:00Z Timestamp(ns, "UTC") +2018-11-04T04:30:00Z Timestamp(ns, "UTC") query PT select ts, arrow_typeof(ts) from timestamp_sao_paulo order by ts; ---- -2018-11-03T22:00:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-03T22:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-03T23:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-04T01:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-04T01:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-04T02:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-04T02:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) +2018-11-03T22:00:00-03:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-03T22:30:00-03:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-03T23:30:00-03:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-04T01:00:00-02:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-04T01:30:00-02:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-04T02:00:00-02:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-04T02:30:00-02:00 Timestamp(ns, "America/Sao_Paulo") # date trunc in utc with DST query PPPP @@ -1797,7 +1797,7 @@ SELECT ts1 + i FROM foo; 2003-07-12T01:31:15.000123463 # Timestamp + Timestamp => error -query error DataFusion error: Error during planning: Cannot get result type for temporal operation Timestamp\(Nanosecond, None\) \+ Timestamp\(Nanosecond, None\): Invalid argument error: Invalid timestamp arithmetic operation: Timestamp\(Nanosecond, None\) \+ Timestamp\(Nanosecond, None\) +query error DataFusion error: Error during planning: Cannot get result type for temporal operation Timestamp\(ns\) \+ Timestamp\(ns\): Invalid argument error: Invalid timestamp arithmetic operation: Timestamp\(ns\) \+ Timestamp\(ns\) SELECT ts1 + ts2 FROM foo; @@ -2256,7 +2256,7 @@ SET 
TIME ZONE = '+00' query T SELECT arrow_typeof(time) FROM foo LIMIT 1 ---- -Timestamp(Nanosecond, Some("+05:00")) +Timestamp(ns, "+05:00") # check date_trunc query P @@ -2271,27 +2271,27 @@ SELECT date_trunc('day', time) FROM foo query T SELECT arrow_typeof(date_trunc('day', time)) FROM foo LIMIT 1 ---- -Timestamp(Nanosecond, Some("+05:00")) +Timestamp(ns, "+05:00") query T select arrow_typeof(date_trunc('minute', to_timestamp_seconds(61))) ---- -Timestamp(Second, None) +Timestamp(s) query T select arrow_typeof(date_trunc('second', to_timestamp_millis(61))) ---- -Timestamp(Millisecond, None) +Timestamp(ms) query T select arrow_typeof(date_trunc('millisecond', to_timestamp_micros(61))) ---- -Timestamp(Microsecond, None) +Timestamp(µs) query T select arrow_typeof(date_trunc('microsecond', to_timestamp(61))) ---- -Timestamp(Nanosecond, None) +Timestamp(ns) # check date_bin query P @@ -2306,7 +2306,7 @@ SELECT date_bin(INTERVAL '1 day', time, '1970-01-01T00:00:00+05:00') FROM foo query T SELECT arrow_typeof(date_bin(INTERVAL '1 day', time, '1970-01-01T00:00:00+05:00')) FROM foo LIMIT 1 ---- -Timestamp(Nanosecond, Some("+05:00")) +Timestamp(ns, "+05:00") # timestamp comparison with and without timezone @@ -2348,7 +2348,7 @@ true true true true true true true true true true true true true query TTT SELECT arrow_typeof(to_timestamp(1)), arrow_typeof(to_timestamp(null)), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000')) ---- -Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) +Timestamp(ns) Timestamp(ns) Timestamp(ns) # verify timestamp output types using timestamp literal syntax query BBBBBB @@ -2384,7 +2384,7 @@ NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:5 query TTT SELECT arrow_typeof(to_timestamp(1, '%c', '%s')), arrow_typeof(to_timestamp(null, '%+', '%s')), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000', '%Y-%m-%d %H:%M:%S%.f')) ---- -Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) +Timestamp(ns) Timestamp(ns) Timestamp(ns) # to_timestamp with invalid formatting query error input contains invalid characters @@ -2690,8 +2690,8 @@ SELECT t1.ts, t1.ts + INTERVAL '1' SECOND FROM t1; query PT SELECT t1.ts::timestamptz, arrow_typeof(t1.ts::timestamptz) FROM t1; ---- -2018-07-01T06:00:00Z Timestamp(Nanosecond, Some("+00")) -2018-07-01T07:00:00Z Timestamp(Nanosecond, Some("+00")) +2018-07-01T06:00:00Z Timestamp(ns, "+00") +2018-07-01T07:00:00Z Timestamp(ns, "+00") query D SELECT 0::TIME @@ -3281,7 +3281,7 @@ from ( select '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' as time ); ---- -2024-04-01T00:00:20+02:00 Timestamp(Nanosecond, Some("Europe/Brussels")) 2024-04-01T00:00:20 Timestamp(Nanosecond, None) +2024-04-01T00:00:20+02:00 Timestamp(ns, "Europe/Brussels") 2024-04-01T00:00:20 Timestamp(ns) # use to_local_time() in date_bin() query P @@ -3326,53 +3326,53 @@ from t; query PPT select column1, to_local_time(column1::timestamp), arrow_typeof(to_local_time(column1::timestamp)) from t_utc; ---- -NULL NULL Timestamp(Nanosecond, None) -2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(Nanosecond, None) -2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(Nanosecond, None) -2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(Nanosecond, None) -2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(Nanosecond, None) -2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(Nanosecond, None) -2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(Nanosecond, None) -2024-07-01T00:00:01Z 
2024-07-01T00:00:01 Timestamp(Nanosecond, None) -2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(Nanosecond, None) -2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(Nanosecond, None) -2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(Nanosecond, None) -2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(Nanosecond, None) -2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(Nanosecond, None) +NULL NULL Timestamp(ns) +2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(ns) +2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(ns) +2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(ns) +2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(ns) +2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(ns) +2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(ns) +2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(ns) +2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(ns) +2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(ns) +2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(ns) +2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(ns) +2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(ns) query PPT select column1, to_local_time(column1), arrow_typeof(to_local_time(column1)) from t_utc; ---- -NULL NULL Timestamp(Nanosecond, None) -2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(Nanosecond, None) -2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(Nanosecond, None) -2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(Nanosecond, None) -2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(Nanosecond, None) -2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(Nanosecond, None) -2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(Nanosecond, None) -2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(Nanosecond, None) -2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(Nanosecond, None) -2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(Nanosecond, None) -2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(Nanosecond, None) -2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(Nanosecond, None) -2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(Nanosecond, None) +NULL NULL Timestamp(ns) +2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(ns) +2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(ns) +2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(ns) +2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(ns) +2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(ns) +2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(ns) +2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(ns) +2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(ns) +2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(ns) +2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(ns) +2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(ns) +2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(ns) query PPT select column1, to_local_time(column1), arrow_typeof(to_local_time(column1)) from t_timezone; ---- -NULL NULL Timestamp(Nanosecond, None) -2024-01-01T00:00:01+01:00 2024-01-01T00:00:01 Timestamp(Nanosecond, None) -2024-02-01T00:00:01+01:00 2024-02-01T00:00:01 Timestamp(Nanosecond, None) -2024-03-01T00:00:01+01:00 2024-03-01T00:00:01 Timestamp(Nanosecond, None) -2024-04-01T00:00:01+02:00 2024-04-01T00:00:01 Timestamp(Nanosecond, None) -2024-05-01T00:00:01+02:00 2024-05-01T00:00:01 Timestamp(Nanosecond, None) -2024-06-01T00:00:01+02:00 2024-06-01T00:00:01 Timestamp(Nanosecond, None) -2024-07-01T00:00:01+02:00 2024-07-01T00:00:01 Timestamp(Nanosecond, None) -2024-08-01T00:00:01+02:00 2024-08-01T00:00:01 Timestamp(Nanosecond, None) -2024-09-01T00:00:01+02:00 
2024-09-01T00:00:01 Timestamp(Nanosecond, None) -2024-10-01T00:00:01+02:00 2024-10-01T00:00:01 Timestamp(Nanosecond, None) -2024-11-01T00:00:01+01:00 2024-11-01T00:00:01 Timestamp(Nanosecond, None) -2024-12-01T00:00:01+01:00 2024-12-01T00:00:01 Timestamp(Nanosecond, None) +NULL NULL Timestamp(ns) +2024-01-01T00:00:01+01:00 2024-01-01T00:00:01 Timestamp(ns) +2024-02-01T00:00:01+01:00 2024-02-01T00:00:01 Timestamp(ns) +2024-03-01T00:00:01+01:00 2024-03-01T00:00:01 Timestamp(ns) +2024-04-01T00:00:01+02:00 2024-04-01T00:00:01 Timestamp(ns) +2024-05-01T00:00:01+02:00 2024-05-01T00:00:01 Timestamp(ns) +2024-06-01T00:00:01+02:00 2024-06-01T00:00:01 Timestamp(ns) +2024-07-01T00:00:01+02:00 2024-07-01T00:00:01 Timestamp(ns) +2024-08-01T00:00:01+02:00 2024-08-01T00:00:01 Timestamp(ns) +2024-09-01T00:00:01+02:00 2024-09-01T00:00:01 Timestamp(ns) +2024-10-01T00:00:01+02:00 2024-10-01T00:00:01 Timestamp(ns) +2024-11-01T00:00:01+01:00 2024-11-01T00:00:01 Timestamp(ns) +2024-12-01T00:00:01+01:00 2024-12-01T00:00:01 Timestamp(ns) # combine to_local_time() with date_bin() query P @@ -3667,7 +3667,7 @@ SELECT arrow_cast(a, 'LargeUtf8') FROM (SELECT TIMESTAMP '2005-09-10 13:31:00' AS a) ---- -Timestamp(Nanosecond, None) 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 +Timestamp(ns) 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 query TTTTT SELECT @@ -3678,4 +3678,4 @@ SELECT arrow_cast(a, 'LargeUtf8') FROM (SELECT CAST('2005-09-10 13:31:00 +02:00' AS timestamp with time zone) AS a) ---- -Timestamp(Nanosecond, Some("+00")) 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z +Timestamp(ns, "+00") 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z diff --git a/datafusion/sqllogictest/test_files/type_coercion.slt b/datafusion/sqllogictest/test_files/type_coercion.slt index 3175a0646b79..e3baa8fedcf6 100644 --- a/datafusion/sqllogictest/test_files/type_coercion.slt +++ b/datafusion/sqllogictest/test_files/type_coercion.slt @@ -47,7 +47,7 @@ query error DataFusion error: Error during planning: Cannot coerce arithmetic ex select interval '1 month' - '2023-05-01'::date; # interval - timestamp -query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types +query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types SELECT interval '1 month' - '2023-05-01 12:30:00'::timestamp; # dictionary(int32, utf8) -> utf8 diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index 1f7605d220c5..75db459b1881 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -521,7 +521,7 @@ physical_plan 16)----ProjectionExec: expr=[1 as cnt] 17)------PlaceholderRowExec 18)----ProjectionExec: expr=[lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as cnt] -19)------BoundedWindowAggExec: wdw=[lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +19)------BoundedWindowAggExec: wdw=[lead(b.c1,Int64(1)) ROWS BETWEEN 
UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 20)--------ProjectionExec: expr=[1 as c1] 21)----------PlaceholderRowExec diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 38fcc1ba9016..50121813133b 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -863,11 +863,11 @@ select count(*) from (select unnest(range(0, 100000)) id) t inner join (select u # Test implicit LATERAL support for UNNEST # Issue: https://github.com/apache/datafusion/issues/13659 # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ data_type: Int64, nullable: true \}\), nullable: true \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) select * from unnest_table u, unnest(u.column1); # Test implicit LATERAL support for UNNEST (INNER JOIN) -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ data_type: Int64, nullable: true \}\), nullable: true \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) select * from unnest_table u INNER JOIN unnest(u.column1) AS t(column1) ON u.column3 = t.column1; # Test implicit LATERAL planning for UNNEST @@ -883,7 +883,7 @@ logical_plan 06)------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[] 07)--------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1)) 08)----------EmptyRelation: rows=1 -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "u" }), name: "column1" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { data_type: Int64, nullable: true }), nullable: true }, Column { relation: Some(Bare { table: "u" }), name: "column1" }) # Test implicit LATERAL planning 
for UNNEST (INNER JOIN) query TT @@ -899,7 +899,7 @@ logical_plan 07)--------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[] 08)----------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1)) 09)------------EmptyRelation: rows=1 -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "u" }), name: "column1" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { data_type: Int64, nullable: true }), nullable: true }, Column { relation: Some(Bare { table: "u" }), name: "column1" }) # uncorrelated EXISTS with unnest query I @@ -969,7 +969,7 @@ physical_plan 08)--------------UnnestExec 09)----------------ProjectionExec: expr=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as generated_id, make_array(value@0) as __unnest_placeholder(make_array(range().value))] 10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -11)--------------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +11)--------------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 12)----------------------LazyMemoryExec: partitions=1, batch_generators=[range: start=1, end=5, batch_size=8192] # Unnest array where data is already ordered by column2 (100, 200, 300, 400) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index f1a708d84dd3..d9b4a818f99e 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -360,7 +360,7 @@ physical_plan 02)--ProjectionExec: expr=[b@0 as b, max(d.a)@1 as max_a, max(d.seq)@2 as max(d.seq)] 03)----AggregateExec: mode=SinglePartitioned, gby=[b@2 as b], aggr=[max(d.a), max(d.seq)], ordering_mode=Sorted 04)------ProjectionExec: expr=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as seq, a@0 as a, b@1 as b] -05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS 
LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[b@1 ASC NULLS LAST, a@0 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([b@1], 4), input_partitions=4 @@ -1241,9 +1241,9 @@ logical_plan 05)--------TableScan: aggregate_test_100 projection=[c8, c9] physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as sum2] -02)--BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c9@1 ASC NULLS LAST, c8@0 ASC NULLS LAST], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c8, c9], file_type=csv, has_header=true @@ -1262,9 +1262,9 @@ logical_plan 05)--------TableScan: aggregate_test_100 projection=[c2, c9] physical_plan 01)ProjectionExec: expr=[c2@0 as c2, max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] 
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -02)--WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] -03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c2@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c9], file_type=csv, has_header=true @@ -1286,10 +1286,10 @@ logical_plan physical_plan 01)SortExec: expr=[c2@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c2@0 as c2, max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] -04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c9@1 ASC NULLS LAST, c2@0 ASC NULLS LAST], preserve_partitioning=[false] -06)----------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 
CURRENT ROW], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 07)------------SortExec: expr=[c2@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[false] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c9], file_type=csv, has_header=true @@ -1311,12 +1311,12 @@ logical_plan 05)--------TableScan: aggregate_test_100 projection=[c1, c2, c4] physical_plan 01)ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@2 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING] -02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 03)----SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 04)------CoalesceBatchesExec: target_batch_size=4096 05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 06)----------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING] -07)------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 08)--------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 09)----------------CoalesceBatchesExec: target_batch_size=4096 10)------------------RepartitionExec: partitioning=Hash([c1@0, c2@1], 2), input_partitions=2 @@ -1343,8 +1343,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=10), expr=[c9@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1386,8 +1386,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY 
[aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=10), expr=[c9@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1446,8 +1446,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2] 02)--GlobalLimitExec: skip=5, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 
FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=15), expr=[c9@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1488,8 +1488,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as fv1, first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as fv2, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as lag1, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as lag2, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as lead1, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as lead2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] 
ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 05)--------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1531,9 +1531,9 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as rn1, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as rn2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 
ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------SortExec: TopK(fetch=10), expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] -05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1573,10 +1573,10 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as rn2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------SortExec: TopK(fetch=10), expr=[c9@2 ASC NULLS LAST, c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], 
preserve_partitioning=[false] -05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 07)------------SortExec: expr=[c9@2 DESC, c1@0 DESC], preserve_partitioning=[false] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c9], file_type=csv, has_header=true @@ -1655,19 +1655,19 @@ logical_plan physical_plan 01)ProjectionExec: expr=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@18 as a, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@18 as b, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@3 as c, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@11 as d, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@7 as e, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@3 as f, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@11 as g, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as h, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as i, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as j, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as k, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as l, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS 
LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@17 as m, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@15 as n, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as o, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@16 as p, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@20 as a1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@20 as b1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@5 as c1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@13 as d1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@9 as e1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@5 as f1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@13 as g1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as h1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as j1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as k1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as l1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as m1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as n1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as o1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@21 as h11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@21 as j11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@6 as k11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@14 as l11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@10 as m11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@6 as n11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@14 as o11] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, 
sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] 04)------ProjectionExec: expr=[c1@0 as c1, c3@2 as c3, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@4 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] 
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@6 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@7 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@8 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@9 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@10 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@11 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@12 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@13 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@14 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@15 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@16 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@17 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@18 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -05)--------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC 
NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c3@2 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false] -07)------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 08)--------------SortExec: expr=[c3@2 ASC NULLS LAST, c1@0 ASC], preserve_partitioning=[false] -09)----------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +09)----------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 10)------------------SortExec: expr=[c3@2 ASC NULLS LAST, c1@0 DESC], preserve_partitioning=[false] -11)--------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }] -12)----------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] +11)--------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED 
PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }] +12)----------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] 13)------------------------SortExec: expr=[c3@2 DESC NULLS LAST], preserve_partitioning=[false] -14)--------------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: 
Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] -15)----------------------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +14)--------------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] +15)----------------------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED 
PRECEDING AND CURRENT ROW], mode=[Sorted] 16)------------------------------SortExec: expr=[c3@2 DESC, c1@0 ASC NULLS LAST], preserve_partitioning=[false] 17)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/null_cases.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true @@ -1741,8 +1741,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=10), expr=[c1@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true @@ -1785,8 +1785,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY 
[aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=10), expr=[c1@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true @@ -1831,9 +1831,9 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c3@1 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + 
aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }] 04)------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, c3@2 as c3, c9@3 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortPreservingMergeExec: [__common_expr_1@0 DESC, c9@3 DESC, c2@1 ASC NULLS LAST] 07)------------SortExec: expr=[__common_expr_1@0 DESC, c9@3 DESC, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 08)--------------ProjectionExec: expr=[c3@1 + c4@2 as __common_expr_1, c2@0 as c2, c3@1 as c3, c9@3 as c9] @@ -1926,13 +1926,13 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c3@0 ASC NULLS LAST], fetch=5 02)--ProjectionExec: expr=[c3@0 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum2] -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: 
wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c3@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=4096 06)----------RepartitionExec: partitioning=Hash([c3@0], 2), input_partitions=2 07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 08)--------------ProjectionExec: expr=[c3@1 as c3, c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -09)----------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +09)----------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 10)------------------SortExec: expr=[c3@1 DESC, c9@2 DESC, c2@0 ASC NULLS LAST], preserve_partitioning=[false] 11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3, c9], file_type=csv, has_header=true @@ -1968,7 +1968,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1] -03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { 
"row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 04)------SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=4096 06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -2097,7 +2097,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, rn1@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1] -03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 04)------SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=4096 06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -2123,10 +2123,10 @@ logical_plan physical_plan 01)SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c1@0 as c1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2] -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------SortPreservingMergeExec: [c9@1 ASC NULLS LAST] 05)--------SortExec: expr=[c9@1 ASC NULLS LAST], preserve_partitioning=[true] -06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 
3 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING], mode=[Sorted] 07)------------SortExec: expr=[c1@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[true] 08)--------------CoalesceBatchesExec: target_batch_size=4096 09)----------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -2211,11 +2211,11 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c9@2 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum2, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum3, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum4] -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c9@3 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, 
aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@6 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING] -05)--------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] -06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -07)------------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] +05)--------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] +06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 
ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +07)------------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] 08)--------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, c9@3 ASC NULLS LAST, c8@2 ASC NULLS LAST], preserve_partitioning=[false] 09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9], file_type=csv, has_header=true @@ -2266,12 +2266,12 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@1 as c9, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sum1, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum2, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum3, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum4] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------ProjectionExec: expr=[c2@0 as c2, c9@2 as c9, c1_alias@3 as c1_alias, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@6 as sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING] 
-05)--------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] +05)--------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] 06)----------ProjectionExec: expr=[c2@1 as c2, c8@2 as c8, c9@3 as c9, c1_alias@4 as c1_alias, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING] -07)------------BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -08)--------------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] +07)------------BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +08)--------------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] 
09)----------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, c9@3 ASC NULLS LAST, c8@2 ASC NULLS LAST], preserve_partitioning=[false] 10)------------------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c8@2 as c8, c9@3 as c9, c1@0 as c1_alias] 11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9], file_type=csv, has_header=true @@ -2312,9 +2312,9 @@ physical_plan 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2] 02)--SortExec: TopK(fetch=5), expr=[c9@2 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum1, sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING@4 as sum2, c9@1 as c9] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING: Field { name: "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING: Field { "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING": nullable Float64 }, frame: GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING], mode=[Sorted] 05)--------ProjectionExec: expr=[c1@0 as c1, c9@2 as c9, c12@3 as c12, sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING] -06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Float64 }, frame: GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 07)------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c9, c12], file_type=csv, has_header=true @@ -2348,7 +2348,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY 
[aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2385,7 +2385,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2422,7 +2422,7 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[rn1@1 DESC], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], 
preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2462,7 +2462,7 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[rn1@1 ASC NULLS LAST, c9@0 ASC NULLS LAST], preserve_partitioning=[false], sort_prefix=[rn1@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2537,7 +2537,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2559,7 +2559,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c5@0 as c5, c9@1 as c9, row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[CAST(c9@1 AS Decimal128(20, 0)) + CAST(c5@0 AS Decimal128(20, 0)) DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5, c9], file_type=csv, has_header=true @@ -2580,7 +2580,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, CAST(row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 AS Int64) as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2685,10 +2685,10 @@ physical_plan 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, sum3@2 as sum3, min1@3 as min1, min2@4 as min2, min3@5 as min3, max1@6 as max1, max2@7 as max2, max3@8 as max3, cnt1@9 as cnt1, cnt2@10 as cnt2, sumr1@11 as sumr1, sumr2@12 as sumr2, sumr3@13 as sumr3, minr1@14 as minr1, minr2@15 as minr2, minr3@16 as minr3, maxr1@17 as maxr1, maxr2@18 as maxr2, maxr3@19 as maxr3, cntr1@20 as cntr1, cntr2@21 as cntr2, sum4@22 as sum4, cnt3@23 as cnt3] 02)--SortExec: TopK(fetch=5), expr=[inc_col@24 DESC], preserve_partitioning=[false] 03)----ProjectionExec: expr=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as sum1, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@14 as sum2, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@15 as sum3, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@16 as min1, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@17 as min2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as min3, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] 
RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as max1, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@20 as max2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@21 as max3, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@22 as cnt1, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@23 as cnt2, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@2 as sumr1, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@3 as sumr2, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sumr3, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as minr1, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@6 as minr2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@7 as minr3, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@8 as maxr1, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@9 as maxr2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@10 as maxr3, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@11 as cntr1, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@12 as cntr2, sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@24 as sum4, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@25 as cnt3, inc_col@1 as inc_col] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 05)--------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, inc_col@3 as inc_col, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC 
NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@5 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@6 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@7 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@8 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@9 as min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@10 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@12 as max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@13 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@14 as count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@15 as count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@16 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@17 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 
PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@20 as min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@21 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@22 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@23 as max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@25 as count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@26 as count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING] -06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts 
ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 4 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS 
FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING: Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, frame: RANGE BETWEEN 2 PRECEDING AND 6 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 8 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) 
ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING": Int64 }, frame: RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 4 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY 
[annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 2 PRECEDING AND 6 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 8 FOLLOWING], mode=[Sorted] 08)--------------ProjectionExec: expr=[CAST(desc_col@2 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Int64) as __common_expr_2, ts@0 as ts, inc_col@1 as inc_col, desc_col@2 as desc_col] 09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col, desc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true @@ -2771,8 +2771,8 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[ts@0 DESC], preserve_partitioning=[false] 02)--ProjectionExec: expr=[ts@0 as ts, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as fv1, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as lv1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv2, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as nv1, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv2, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as rn1, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as rn2, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as rank1, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as rank2, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@20 as dense_rank1, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@21 as dense_rank2, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as lag1, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@23 as lag2, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as lead1, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 
PRECEDING AND 1 FOLLOWING@25 as lead2, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as fvr1, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as lvr1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as lvr2, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@6 as lagr1, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@7 as lagr2, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as leadr1, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as leadr2] -03)----BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY 
[annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY 
[annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, 
lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "row_number() ORDER BY 
[annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, 
last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING], mode=[Sorted] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true query IIIIIIIIIIIIIIIIIIIIIIIII @@ -2843,8 +2843,8 @@ physical_plan 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, min1@2 as min1, min2@3 as min2, max1@4 as max1, max2@5 as max2, count1@6 as count1, count2@7 as count2, avg1@8 as avg1, avg2@9 as avg2] 02)--SortExec: TopK(fetch=5), expr=[inc_col@10 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@9 as sum1, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@10 as min1, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@5 as min2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 
UNBOUNDED PRECEDING AND 5 FOLLOWING@11 as max1, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@6 as max2, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@12 as count1, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@7 as count2, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@13 as avg1, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@8 as avg2, inc_col@3 as inc_col] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING], mode=[Sorted] -05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, 
min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Float64 }, 
frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] 06)----------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Float64) as __common_expr_2, ts@0 as ts, inc_col@1 as inc_col] 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true @@ -2895,8 +2895,8 @@ physical_plan 01)ProjectionExec: expr=[first_value1@0 as first_value1, first_value2@1 as first_value2, last_value1@2 as last_value1, last_value2@3 as last_value2, nth_value1@4 as nth_value1] 02)--SortExec: TopK(fetch=5), expr=[inc_col@5 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@4 as first_value1, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@2 as first_value2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as last_value1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as last_value2, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as nth_value1, inc_col@1 as inc_col] -04)------BoundedWindowAggExec: 
wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] -05)--------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: 
wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true query IIIII @@ -2939,8 +2939,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as sum1, sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum2, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as count1, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as count2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED 
PRECEDING AND 3 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] 05)--------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, ts@0 as ts, inc_col@1 as inc_col] 06)----------StreamingTableExec: partition_sizes=1, projection=[ts, inc_col], infinite_source=true, output_ordering=[ts@0 ASC NULLS LAST] @@ -2984,8 +2984,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as sum1, sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum2, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as count1, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as count2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: 
wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] 05)--------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, ts@0 as ts, inc_col@1 as inc_col] 06)----------StreamingTableExec: partition_sizes=1, projection=[ts, inc_col], infinite_source=true, output_ordering=[ts@0 ASC NULLS LAST] @@ -3084,12 +3084,12 @@ logical_plan physical_plan 01)ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as sum1, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as sum2, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 
2 PRECEDING AND 1 FOLLOWING@15 as sum3, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING@16 as sum4, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as sum5, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as sum6, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as sum7, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as sum8, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as sum9, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as sum10, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as sum11, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as sum12] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Linear] -04)------BoundedWindowAggExec: 
wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[PartiallySorted([1, 0])] -05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", 
data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[PartiallySorted([0])] -07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0, 1])] -08)--------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] 
ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Linear] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[PartiallySorted([1, 0])] +05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], 
mode=[PartiallySorted([0])] +07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0, 1])] +08)--------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 09)----------------ProjectionExec: expr=[CAST(c@2 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] 10)------------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST] @@ -3152,17 +3152,17 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[c@2 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as sum1, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as sum2, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@15 as sum3, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, 
annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING@16 as sum4, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as sum5, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as sum6, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as sum7, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as sum8, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as sum9, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as sum10, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as sum11, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as sum12] -03)----BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, 
annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Sorted] 04)------SortExec: expr=[d@4 ASC NULLS LAST, a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[b@2 ASC NULLS LAST, a@1 ASC NULLS LAST, d@4 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 
FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 08)--------------SortExec: expr=[b@2 ASC NULLS LAST, a@1 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[Sorted] +09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field 
{ "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[Sorted] 10)------------------SortExec: expr=[a@1 ASC NULLS LAST, d@4 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -11)--------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted] +11)--------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted] 12)----------------------SortExec: expr=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, d@4 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -13)------------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) 
PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +13)------------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 14)--------------------------ProjectionExec: expr=[CAST(c@2 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] 15)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true @@ -3226,7 +3226,7 @@ physical_plan 01)ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as rn1] 02)--CoalesceBatchesExec: target_batch_size=4096, fetch=5 03)----FilterExec: row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 < 50 -04)------BoundedWindowAggExec: wdw=[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST] # Top level sort is pushed down through BoundedWindowAggExec as its SUM result does already satisfy the required @@ -3248,7 +3248,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as sum1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -3333,11 +3333,11 @@ logical_plan 08)--------------TableScan: annotated_data_infinite2 projection=[a, b, c, d] physical_plan 01)ProjectionExec: expr=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum3, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum4] -02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear] +02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { 
"sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear] 03)----ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, a@1 as a, d@4 as d, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] -05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])] -06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: 
wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])] +06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 07)------------ProjectionExec: expr=[CAST(a@0 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] 08)--------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST] @@ -3364,17 +3364,17 @@ logical_plan 08)--------------TableScan: annotated_data_infinite2 projection=[a, b, c, d] physical_plan 01)ProjectionExec: expr=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum3, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum4] -02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear] +02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) 
PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------RepartitionExec: partitioning=Hash([d@2], 2), input_partitions=2, preserve_order=true, sort_exprs=__common_expr_1@0 ASC NULLS LAST, a@1 ASC NULLS LAST 05)--------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, a@1 as a, d@4 as d, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 07)------------CoalesceBatchesExec: target_batch_size=4096 08)--------------RepartitionExec: partitioning=Hash([b@2, a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST -09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, 
annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])] +09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])] 10)------------------CoalesceBatchesExec: target_batch_size=4096 11)--------------------RepartitionExec: partitioning=Hash([a@1, d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST -12)----------------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +12)----------------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 13)------------------------CoalesceBatchesExec: target_batch_size=4096 14)--------------------------RepartitionExec: partitioning=Hash([a@1, b@2], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST 15)----------------------------ProjectionExec: expr=[CAST(a@0 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] @@ -3433,10 +3433,10 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[c3@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c3@0 as c3, max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as max1] -03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY 
[aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c12@1 ASC NULLS LAST], preserve_partitioning=[false] 05)--------ProjectionExec: expr=[c3@0 as c3, c12@2 as c12, min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] -06)----------WindowAggExec: wdw=[min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +06)----------WindowAggExec: wdw=[min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 07)------------SortExec: expr=[c11@1 ASC NULLS LAST], preserve_partitioning=[false] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c11, c12], file_type=csv, has_header=true @@ -3477,7 +3477,7 @@ physical_plan 01)ProjectionExec: expr=[min1@0 as min1, max1@1 as max1] 02)--SortExec: TopK(fetch=5), expr=[c3@2 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min1, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max1, c3@0 as c3] -04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(aggregate_test_100.c12) ORDER 
BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c12@1 ASC NULLS LAST], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c12], file_type=csv, has_header=true @@ -3529,7 +3529,7 @@ logical_plan 02)--Filter: multiple_ordered_table.b = Int32(0) 03)----TableScan: multiple_ordered_table projection=[a0, a, b, c, d], partial_filters=[multiple_ordered_table.b = Int32(0)] physical_plan -01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 02)--CoalesceBatchesExec: target_batch_size=4096 03)----FilterExec: b@2 = 0 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]], file_type=csv, has_header=true @@ -3547,7 +3547,7 @@ logical_plan 02)--Filter: multiple_ordered_table.b = Int32(0) 03)----TableScan: multiple_ordered_table projection=[a0, a, b, c, d], partial_filters=[multiple_ordered_table.b = Int32(0)] physical_plan -01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, 
multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 02)--SortExec: expr=[d@4 ASC NULLS LAST], preserve_partitioning=[false] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------FilterExec: b@2 = 0 @@ -3584,9 +3584,9 @@ logical_plan 05)--------TableScan: multiple_ordered_table projection=[a, b, c, d] physical_plan 01)ProjectionExec: expr=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as max1] -02)--BoundedWindowAggExec: wdw=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----ProjectionExec: expr=[c@2 as c, d@3 as d, max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -04)------BoundedWindowAggExec: wdw=[max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_orderings=[[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], [c@2 ASC NULLS LAST]], file_type=csv, has_header=true query TT @@ -3603,7 +3603,7 @@ logical_plan 04)------TableScan: multiple_ordered_table projection=[c, d], partial_filters=[multiple_ordered_table.d = Int32(0)] physical_plan 01)ProjectionExec: expr=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as max_c] -02)--BoundedWindowAggExec: wdw=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------FilterExec: d@1 = 0 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true @@ -3618,7 +3618,7 @@ logical_plan 03)----TableScan: multiple_ordered_table projection=[a, c, d] physical_plan 01)ProjectionExec: expr=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c, d], output_orderings=[[a@0 
ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true query TT @@ -3631,7 +3631,7 @@ logical_plan 03)----TableScan: multiple_ordered_table projection=[a, b, c, d] physical_plan 01)ProjectionExec: expr=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_orderings=[[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], [c@2 ASC NULLS LAST]], file_type=csv, has_header=true query I @@ -3673,7 +3673,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c@0 as c, nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as nv1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int32(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int32(NULL)), is_causal: false }] 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true query II @@ -3724,7 +3724,7 @@ 
logical_plan physical_plan 01)SortPreservingMergeExec: [c@3 ASC NULLS LAST] 02)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW@5 as avg_d] -03)----BoundedWindowAggExec: wdw=[avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW: Field { name: "avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND CURRENT ROW], mode=[Linear] +03)----BoundedWindowAggExec: wdw=[avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW: Field { "avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN 10 PRECEDING AND CURRENT ROW], mode=[Linear] 04)------CoalesceBatchesExec: target_batch_size=4096 05)--------RepartitionExec: partitioning=Hash([d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST 06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -4059,7 +4059,7 @@ logical_plan 03)----TableScan: table_with_pk projection=[sn, ts, currency, amount] physical_plan 01)ProjectionExec: expr=[sn@0 as sn, ts@1 as ts, currency@2 as currency, amount@3 as amount, sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum1] -02)--BoundedWindowAggExec: wdw=[sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----SortExec: expr=[sn@0 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -4178,9 +4178,9 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c3@0 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum2, sum1@3 as sum1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + 
aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }] 04)------ProjectionExec: expr=[c3@0 as c3, c4@1 as c4, c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c3@0 + c4@1 DESC], preserve_partitioning=[false] 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c4, c9], file_type=csv, has_header=true @@ -4219,7 +4219,7 @@ logical_plan 04)------TableScan: a projection=[a] physical_plan 01)ProjectionExec: expr=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as count(*) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 05)--------RepartitionExec: 
partitioning=RoundRobinBatch(2), input_partitions=1 @@ -4242,7 +4242,7 @@ logical_plan 04)------TableScan: a projection=[a] physical_plan 01)ProjectionExec: expr=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] -02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -5311,7 +5311,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank@2 ASC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank] -03)----BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=1 06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5355,7 +5355,7 @@ physical_plan 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank] 03)----CoalesceBatchesExec: target_batch_size=1 04)------FilterExec: c2@1 >= 10 -05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=1 08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5397,7 +5397,7 @@ physical_plan 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank] 03)----CoalesceBatchesExec: target_batch_size=1 04)------FilterExec: c2@1 = 10 -05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c2@1 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=1 08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5438,7 +5438,7 @@ physical_plan 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank] 03)----CoalesceBatchesExec: target_batch_size=1 04)------FilterExec: c1@0 = 1 OR c2@1 = 10 -05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=1 08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5481,11 +5481,11 @@ physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST] 02)--SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST], preserve_partitioning=[true] 03)----ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank1, rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED 
PRECEDING AND CURRENT ROW@3 as rank2] -04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c2@1 ASC NULLS LAST, c1@0 ASC NULLS LAST], preserve_partitioning=[true] 06)----------CoalesceBatchesExec: target_batch_size=1 07)------------RepartitionExec: partitioning=Hash([c2@1, c1@0], 2), input_partitions=2 -08)--------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +08)--------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 09)----------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 10)------------------CoalesceBatchesExec: target_batch_size=1 11)--------------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5532,13 +5532,13 @@ physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST] 02)--SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST], preserve_partitioning=[true] 03)----ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank1, rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rank2] -04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT 
ROW], mode=[Sorted] 05)--------SortExec: expr=[c2@1 ASC NULLS LAST, c1@0 ASC NULLS LAST], preserve_partitioning=[true] 06)----------CoalesceBatchesExec: target_batch_size=1 07)------------RepartitionExec: partitioning=Hash([c2@1, c1@0], 2), input_partitions=2 08)--------------CoalesceBatchesExec: target_batch_size=1 09)----------------FilterExec: c2@1 > 1 -10)------------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +10)------------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 11)--------------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 12)----------------------CoalesceBatchesExec: target_batch_size=1 13)------------------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5599,7 +5599,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, sum_c9@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as sum_c9] -03)----WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 04)------CoalesceBatchesExec: target_batch_size=1 05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2, preserve_order=true, sort_exprs=c1@0 ASC NULLS LAST 06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -5615,7 +5615,7 @@ logical_plan 04)------TableScan: aggregate_test_100_ordered projection=[c9] physical_plan 01)ProjectionExec: expr=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as sum_c9] -02)--WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +02)--WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -5630,7 +5630,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, min_c5@1 DESC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as min_c5] -03)----WindowAggExec: wdw=[min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 04)------CoalesceBatchesExec: target_batch_size=1 05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2, preserve_order=true, sort_exprs=c1@0 ASC NULLS LAST 06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -5646,7 +5646,7 @@ logical_plan 04)------TableScan: aggregate_test_100_ordered projection=[c5] physical_plan 01)ProjectionExec: expr=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as max_c5] -02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], 
file_type=csv, has_header=true query II rowsort @@ -5829,7 +5829,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [k@0 ASC NULLS LAST, time@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[k@0 as k, time@2 as time, count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@3 as normal_count, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@4 as distinct_count] -03)----BoundedWindowAggExec: wdw=[count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { name: "count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { name: "count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[k@0 ASC NULLS LAST, time@2 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=1 06)----------RepartitionExec: partitioning=Hash([k@0], 2), input_partitions=2 @@ -5892,7 +5892,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [k@0 ASC NULLS LAST, time@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[k@1 as k, time@2 as time, sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING 
AND CURRENT ROW@3 as sum_v, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@4 as sum_distinct_v] -03)----BoundedWindowAggExec: wdw=[sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { name: "sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { name: "sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[k@1 ASC NULLS LAST, time@2 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=1 06)----------RepartitionExec: partitioning=Hash([k@1], 2), input_partitions=2 @@ -5937,7 +5937,7 @@ LIMIT 5 ---- DataFusion error: type_coercion caused by -Error during planning: Cannot infer common argument type for comparison operation Int64 >= List(Field { name: "item", data_type: Null, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +Error during planning: Cannot infer common argument type for comparison operation Int64 >= List(nullable Null) @@ -5965,7 +5965,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c1@2 as c1, c2@3 as c2, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum1, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > 
Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as count1, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as array_agg1, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as array_agg2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT 
ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable List(nullable Int64) }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable List(nullable Int64) }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortPreservingMergeExec: [c1@2 ASC NULLS LAST, c2@3 ASC NULLS LAST], fetch=5 05)--------SortExec: TopK(fetch=5), expr=[c1@2 ASC NULLS LAST, c2@3 ASC NULLS LAST], preserve_partitioning=[true] 06)----------ProjectionExec: expr=[__common_expr_3@0 as __common_expr_1, __common_expr_3@0 AND c2@2 < 4 AND c1@1 > 0 as __common_expr_2, c1@1 as c1, c2@2 as c2] diff --git a/datafusion/sqllogictest/test_files/window_limits.slt b/datafusion/sqllogictest/test_files/window_limits.slt index c1e680084f4b..883cd4404f4f 100644 --- a/datafusion/sqllogictest/test_files/window_limits.slt +++ b/datafusion/sqllogictest/test_files/window_limits.slt @@ -71,7 +71,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", 
data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=4), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -108,7 +108,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -170,7 +170,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as lead1, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as lead3, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as lead5] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=10), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true @@ -207,7 +207,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=3), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -244,7 +244,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as 
nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=3), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -309,7 +309,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as running_sum, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as running_avg, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as running_min, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as running_max] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], 
mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@1 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno, salary], file_type=csv, has_header=true @@ -371,7 +371,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rnk, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as drnk] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, rank() ORDER 
BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -433,7 +433,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as pr, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as cd, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as nt] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }] +03)----WindowAggExec: wdw=[percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "ntile(Int64(4)) ORDER BY 
[employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }] 04)------SortExec: expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -498,7 +498,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as fv, lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as l1, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as lv, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as n3] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, last_value(employees.salary) ORDER BY 
[employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true @@ -541,7 +541,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [depname@0 ASC NULLS LAST], fetch=5 02)--ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, salary@2 as salary, sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW@3 as running_sum] -03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { name: "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=4 @@ -587,7 +587,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [depname@0 ASC NULLS LAST], fetch=5 02)--ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, salary@2 as salary, sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW@3 as running_sum] -03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { name: "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 
CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=4 @@ -764,6 +764,6 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, salary@1 as salary, lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as lead2] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 605dfc15be3f..8417bd56852f 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -40,7 +40,7 @@ itertools = { workspace = true } object_store = { workspace = true } pbjson-types = { workspace = true } prost = { workspace = true } -substrait = { version = "0.58", features = ["serde"] } +substrait = { version = "0.59", features = ["serde"] } url = { workspace = true } tokio = { workspace = true, features = ["fs"] } uuid = { version = "1.17.0", features = ["v4"] } diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index ecf465dd3f18..45a19cea80cf 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -151,8 +151,8 @@ pub async fn from_substrait_rel( .iter() .map(|item| item.field as usize) .collect(); - base_config_builder = - base_config_builder.with_projection(Some(column_indices)); + base_config_builder = base_config_builder + .with_projection_indices(Some(column_indices)); } } diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs index 63abd14d6f5e..20d41c2e6112 100644 --- a/datafusion/substrait/src/physical_plan/producer.rs +++ b/datafusion/substrait/src/physical_plan/producer.rs @@ -92,11 +92,12 @@ pub fn to_substrait_rel( }; let mut select_struct = None; - if let Some(projection) = file_config.projection.as_ref() { + if let Some(projection) = file_config.projection_exprs.as_ref() { let struct_items = projection - .iter() + .column_indices() + .into_iter() .map(|index| StructItem { - field: *index as i32, + field: index as i32, // FIXME: duckdb sets this 
to None, but it's not clear why.
 // https://github.com/duckdb/substrait/blob/b6f56643cb11d52de0e32c24a01dfd5947df62be/src/to_substrait.cpp#L1191
 child: None,
diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh
index ed3e699c1413..90bbc5d3bad0 100755
--- a/dev/update_config_docs.sh
+++ b/dev/update_config_docs.sh
@@ -175,6 +175,66 @@ SET datafusion.execution.batch_size = 1024;
 [`FairSpillPool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html
+## Join Queries
+
+Apache DataFusion currently supports the following join algorithms:
+
+- Nested Loop Join
+- Sort Merge Join
+- Hash Join
+- Symmetric Hash Join
+- Piecewise Merge Join (experimental)
+
+The physical planner will choose the appropriate algorithm based on the statistics and join
+condition of the two tables.
+
+## Join Algorithm Optimizer Configurations
+
+You can modify join optimization behavior in your queries by setting specific configuration values.
+Use the following command to update a configuration:
+
+``` sql
+SET datafusion.optimizer.<option_name> = <value>;
+```
+
+Example:
+
+``` sql
+SET datafusion.optimizer.prefer_hash_join = false;
+```
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query.
+
+### allow_symmetric_joins_without_pruning (bool, default = true)
+
+Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
+lack ordering or filtering.
+
+- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers, resulting in certain join types being produced only at the end of execution.
+
+### prefer_hash_join (bool, default = true)
+
+Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
+
+- true: favors HashJoin for faster execution when sufficient memory is available.
+- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
+
+### enable_piecewise_merge_join (bool, default = false)
+
+Enables the experimental Piecewise Merge Join algorithm.
+
+- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range
+  filter in the join condition.
+- Piecewise Merge Join is faster than Nested Loop Join for a single range filter,
+  except when joining two large tables (num_rows > 100,000) that are approximately
+  equal in size.
+
 EOF
diff --git a/docs/source/contributor-guide/benchmarking.md b/docs/source/contributor-guide/benchmarking.md
new file mode 100644
index 000000000000..816e5bb2478c
--- /dev/null
+++ b/docs/source/contributor-guide/benchmarking.md
@@ -0,0 +1,420 @@
+# Benchmarking
+
+This page describes the comprehensive benchmarking infrastructure available in Apache DataFusion to help contributors understand, leverage, and extend performance testing capabilities.
+
+## Overview
+
+DataFusion includes an extensive suite of benchmarks designed to measure performance across different workloads and use cases.
These benchmarks help contributors: + +- Validate performance improvements in pull requests +- Identify performance regressions +- Compare DataFusion's performance against other engines +- Find appropriate places to add new benchmark code + +## Benchmark Categories + +### Performance Benchmarks + +#### TPCH (TPC-H Benchmark) + +Industry-standard decision support benchmark derived from TPC-H version 2.17.1. + +**Purpose**: Tests complex analytical queries with joins, aggregations, and sorting +**Data**: Synthetic business data (customers, orders, parts, suppliers) +**Usage**: + +```bash +# Generate data +./bench.sh data tpch + +# Run benchmark +./bench.sh run tpch + +# Run specific query (e.g., Q21) +./bench.sh run tpch10 21 +``` + +#### ClickBench + +Widely cited benchmark focusing on grouping, aggregation, and filtering operations. + +**Purpose**: Tests analytical query performance on real-world-like data +**Data**: Web analytics dataset +**Usage**: + +```bash +./bench.sh data clickbench +./bench.sh run clickbench +``` + +#### IMDB (Join Order Benchmark) + +Real-world movie database benchmark testing query optimization with skewed data. + +**Purpose**: Tests join ordering and cardinality estimation with realistic data distribution +**Data**: Internet Movie Database with correlated columns and data skew +**Usage**: + +```bash +./bench.sh data imdb +./bench.sh run imdb +``` + +#### H2O.ai Benchmarks + +Performance tests for groupby, join, and window operations with configurable data sizes. + +**Purpose**: Tests scalability across different data volumes +**Data Sizes**: Small (1e7), Medium (1e8), Big (1e9 rows) +**Usage**: + +```bash +# Groupby benchmarks +./bench.sh data h2o_small +./bench.sh run h2o_small + +# Join benchmarks +./bench.sh data h2o_small_join +./bench.sh run h2o_small_join + +# Window function benchmarks +./bench.sh data h2o_small_window +./bench.sh run h2o_small_window +``` + +### Specialized Benchmarks + +#### Sort Benchmarks + +Tests sorting performance on large datasets. + +**Sort TPCH**: End-to-end sorting on TPCH lineitem table + +```bash +./bench.sh run sort_tpch +./bench.sh run topk_tpch # TopK variant +``` + +**Sort**: General sorting performance on synthetic web server logs + +```bash +./bench.sh run sort +``` + +#### External Aggregation + +Tests aggregation performance with memory limits and spilling to disk. + +**Purpose**: Validates out-of-core aggregation performance +**Usage**: + +```bash +./bench.sh data external_aggr +./bench.sh run external_aggr +``` + +#### Parquet Filter + +Tests Parquet filter pushdown performance. + +**Purpose**: Measures filter pushdown optimization effectiveness +**Data**: Synthetic web server access logs +**Usage**: + +```bash +./bench.sh run parquet_filter +``` + +### Micro-benchmarks + +#### Hash Join + +Focuses specifically on hash join performance with minimal overhead. + +**Purpose**: Isolated hash join performance testing +**Data**: Uses `range()` table function +**Usage**: + +```bash +./bench.sh run hj +``` + +#### Nested Loop Join + +Tests nested loop join performance across various workloads. + +**Purpose**: Isolated nested loop join performance testing +**Usage**: + +```bash +./bench.sh run nlj +``` + +#### Cancellation + +Tests query cancellation performance and cleanup time. 
+ +**Purpose**: Ensures queries stop executing quickly when cancelled +**Usage**: + +```bash +./bench.sh run cancellation +``` + +## Running Benchmarks + +### Using bench.sh Script (Recommended) + +The `bench.sh` script provides the easiest way to run benchmarks: + +```bash +# Navigate to benchmarks directory +cd benchmarks/ + +# Show usage +./bench.sh + +# Generate all datasets +./bench.sh data + +# Generate specific dataset +./bench.sh data tpch + +# Run all benchmarks +./bench.sh run + +# Run specific benchmark +./bench.sh run tpch + +# Compare results between branches +git checkout main +./bench.sh run tpch +git checkout my-branch +./bench.sh run tpch +./bench.sh compare main my-branch +``` + +### Using dfbench Binary Directly + +For more control, use the `dfbench` binary: + +```bash +# Build in release mode (required for accurate benchmarks) +cargo build --release --bin dfbench + +# Run TPCH benchmark +cargo run --release --bin dfbench -- tpch \ + --iterations 3 \ + --path ./data \ + --format parquet \ + --query 1 + +# Get help for specific benchmark +cargo run --release --bin dfbench -- tpch --help +``` + +### Memory Profiling + +Use `mem_profile` to measure memory usage: + +```bash +cargo run --profile release-nonlto --bin mem_profile -- \ + --bench-profile release-nonlto \ + tpch --path benchmarks/data/tpch_sf1 --partitions 4 --format parquet +``` + +## Criterion Benchmarks + +DataFusion uses Criterion for micro-benchmarks of individual components: + +```bash +# Run all criterion benchmarks +cargo bench + +# Run specific benchmark group +cargo bench --bench aggregate_query_sql +cargo bench --bench sort + +# Run with additional features +cargo bench --features jit +``` + +## Comparing Performance + +### Between Branches + +```bash +# Baseline on main branch +git checkout main +./benchmarks/bench.sh data +./benchmarks/bench.sh run tpch + +# Test branch performance +git checkout my-feature-branch +./benchmarks/bench.sh run tpch + +# Compare results +./benchmarks/bench.sh compare main my-feature-branch +``` + +### Using JSON Output + +```bash +# Generate JSON results +cargo run --release --bin dfbench -- tpch \ + --path ./data --format parquet \ + --output /tmp/results.json + +# Compare JSON files +./benchmarks/compare.py /tmp/baseline.json /tmp/feature.json +``` + +## Configuration Options + +### Environment Variables + +Configure DataFusion behavior during benchmarks: + +```bash +# Disable hash joins +PREFER_HASH_JOIN=false ./bench.sh run tpch + +# Disable join repartitioning +DATAFUSION_OPTIMIZER_REPARTITION_JOINS=false ./bench.sh run tpch + +# Enable debug logging +RUST_LOG=info ./bench.sh run tpch +``` + +### Memory Allocators + +Enable alternative allocators for performance testing: + +```bash +# Using mimalloc +cargo run --release --features "mimalloc" --bin dfbench -- tpch \ + --path ./data --format parquet + +# Using snmalloc +cargo run --release --features "snmalloc" --bin dfbench -- tpch \ + --path ./data --format parquet +``` + +## Adding New Benchmarks + +### Step 1: Shell Script Integration + +Add to `benchmarks/bench.sh`: + +```bash +# Add data generation function +data_my_benchmark() { + echo "Generating data for my_benchmark..." + # Data generation logic +} + +# Add run function +run_my_benchmark() { + echo "Running my_benchmark..." 
+    cargo run --release --bin dfbench -- my-benchmark \
+        --path "${DATA_DIR}" \
+        --output "${RESULTS_FILE}"
+}
+```
+
+### Step 2: dfbench Integration
+
+In `benchmarks/src/bin/dfbench.rs`:
+
+```rust
+// Add to Options enum
+enum Options {
+    // ... existing variants
+    MyBenchmark(my_benchmark::RunOpt),
+}
+
+// Add to main function
+match opt {
+    // ... existing matches
+    Options::MyBenchmark(opt) => opt.run().await?,
+}
+```
+
+### Step 3: Implementation
+
+Create `benchmarks/src/my_benchmark.rs`:
+
+```rust
+use crate::util::BenchmarkRun;
+use datafusion::error::Result;
+use structopt::StructOpt;
+
+#[derive(Debug, StructOpt)]
+#[structopt(about = "Run my custom benchmark")]
+pub struct RunOpt {
+    #[structopt(long, help = "Path to data directory")]
+    path: String,
+
+    #[structopt(long, help = "Output file for results")]
+    output: Option<String>,
+}
+
+impl RunOpt {
+    pub async fn run(self) -> Result<()> {
+        let mut benchmark_run = BenchmarkRun::new();
+
+        // Benchmark implementation
+        benchmark_run.start_new_case("My Test Case");
+        // ... run and time your benchmark, producing `elapsed_time`
+        benchmark_run.write_iter(elapsed_time);
+
+        benchmark_run.maybe_write_json(self.output.as_ref());
+        Ok(())
+    }
+}
+```
+
+## Best Practices
+
+### For Contributors
+
+1. **Always use release builds** for performance testing
+2. **Run multiple iterations** to account for variance
+3. **Compare against baseline** before submitting PRs
+4. **Document performance changes** in PR descriptions
+5. **Use appropriate scale factors** for your testing environment
+
+### For Benchmark Development
+
+1. **Minimize non-benchmark overhead** in micro-benchmarks
+2. **Use realistic data distributions** when possible
+3. **Include both memory and disk-based scenarios**
+4. **Test with different scale factors**
+5. **Provide clear documentation** for new benchmarks
+
+## Troubleshooting
+
+### Common Issues
+
+**Out of memory errors**:
+
+```bash
+# Reduce scale factor or increase memory limit
+cargo run --release --bin dfbench -- tpch --path ./data --format parquet --memory-limit 4G
+```
+
+**Slow benchmark execution**:
+
+```bash
+# Ensure release build
+cargo build --release
+
+# Check system resources
+htop
+```
+
+**Missing data**:
+
+```bash
+# Regenerate benchmark data
+./bench.sh data tpch
+```
diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md
index 383827893c70..df664975a84a 100644
--- a/docs/source/contributor-guide/index.md
+++ b/docs/source/contributor-guide/index.md
@@ -113,7 +113,7 @@ do take priority over the conventional commit approach, allowing maintainers to
 [conventional commits]: https://www.conventionalcommits.org/en/v1.0.0/
-# Reviewing Pull Requests
+## Reviewing Pull Requests
 Some helpful links:
@@ -175,3 +175,27 @@ The good thing about open code and open development is that any issues in one ch
 Pull requests will be marked with a `stale` label after 60 days of inactivity and then closed 7 days after that. Commenting on the PR will remove the `stale` label.
+
+## AI-Assisted contributions
+
+DataFusion has the following policy for AI-assisted PRs:
+
+- The PR author should **understand the core ideas** behind the implementation **end-to-end**, and be able to justify the design and code during review.
+- **Call out unknowns and assumptions**. It's okay to not fully understand some bits of AI-generated code. You should comment on these cases and point them out to reviewers so that they can use their knowledge of the codebase to clear up any concerns.
+  For example, you might comment "calling this function here seems to work but I'm not familiar with how it works internally, I wonder if there's a race condition if it is called concurrently".
+
+### Why fully AI-generated PRs without understanding are not helpful
+
+Today, AI tools cannot reliably make complex changes to DataFusion on their own, which is why we rely on pull requests and code review.
+
+The purposes of code review are:
+
+1. Finish the intended task.
+2. Share knowledge between authors and reviewers, as a long-term investment in the project. For this reason, even if someone familiar with the codebase can finish a task quickly, we're still happy to help a new contributor work on it even if it takes longer.
+
+An AI dump for an issue doesn’t meet these purposes. Maintainers could finish the task faster by using AI directly, and the submitters gain little knowledge if they act only as a pass-through AI proxy without understanding.
+
+Please understand that reviewing capacity for the project is **very limited**, so large PRs that appear to lack the requisite understanding might not get reviewed and may eventually be closed or redirected.
+
+### Better ways to contribute than an “AI dump”
+
+It's recommended to write a high-quality issue with a clear problem statement and a minimal, reproducible example. This can make it easier for others to contribute.
diff --git a/docs/source/contributor-guide/testing.md b/docs/source/contributor-guide/testing.md
index dd22e1236081..cd72257f17d9 100644
--- a/docs/source/contributor-guide/testing.md
+++ b/docs/source/contributor-guide/testing.md
@@ -132,6 +132,8 @@ tested in the same way using the [doc_comment] crate. See the end of
 ## Benchmarks
+For comprehensive information about all available benchmarks in DataFusion, see the dedicated {doc}`benchmarking` page.
+
 ### Criterion Benchmarks
 [Criterion](https://docs.rs/criterion/latest/criterion/index.html) is a statistics-driven micro-benchmarking framework used by DataFusion for evaluating the performance of specific code-paths. In particular, the criterion benchmarks help to both guide optimisation efforts, and prevent performance regressions within DataFusion.
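+A minimal Criterion benchmark looks roughly like the sketch below. This is an illustrative example only (the `sum_1k` name and the benchmarked expression are made up, not taken from the DataFusion tree); real criterion benchmarks live under each crate's `benches/` directory and are registered in `Cargo.toml` with `harness = false`.
+
+```rust
+use criterion::{criterion_group, criterion_main, Criterion};
+
+// Criterion runs the closure many times and reports statistically robust timings.
+fn bench_example(c: &mut Criterion) {
+    c.bench_function("sum_1k", |b| b.iter(|| (0..1_000u64).sum::<u64>()));
+}
+
+criterion_group!(benches, bench_example);
+criterion_main!(benches);
+```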
diff --git a/docs/source/index.rst b/docs/source/index.rst index 6bb3c9485b71..70306612c3cb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -118,6 +118,7 @@ To get started, see user-guide/crate-configuration user-guide/cli/index user-guide/dataframe + user-guide/arrow-introduction user-guide/expressions user-guide/sql/index user-guide/configs @@ -157,6 +158,7 @@ To get started, see contributor-guide/development_environment contributor-guide/architecture contributor-guide/testing + contributor-guide/benchmarking contributor-guide/api-health contributor-guide/howtos contributor-guide/roadmap diff --git a/docs/source/library-user-guide/functions/adding-udfs.md b/docs/source/library-user-guide/functions/adding-udfs.md index ecb618179ea1..7581d8b6505e 100644 --- a/docs/source/library-user-guide/functions/adding-udfs.md +++ b/docs/source/library-user-guide/functions/adding-udfs.md @@ -586,6 +586,119 @@ For async UDF implementation details, see [`async_udf.rs`](https://github.com/ap [`process_scalar_func_inputs`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/functions/fn.process_scalar_func_inputs.html [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs +## Named Arguments + +DataFusion supports PostgreSQL-style named arguments for scalar functions, allowing you to pass arguments by parameter name: + +```sql +SELECT substr(str => 'hello', start_pos => 2, length => 3); +``` + +Named arguments can be mixed with positional arguments, but positional arguments must come first: + +```sql +SELECT substr('hello', start_pos => 2, length => 3); -- Valid +``` + +### Implementing Functions with Named Arguments + +To support named arguments in your UDF, add parameter names to your function's signature using `.with_parameter_names()`: + +```rust +# use arrow::datatypes::DataType; +# use datafusion_expr::{Signature, Volatility}; +# +# #[derive(Debug)] +# struct MyFunction { +# signature: Signature, +# } +# +impl MyFunction { + fn new() -> Self { + Self { + signature: Signature::uniform( + 2, + vec![DataType::Float64], + Volatility::Immutable + ) + .with_parameter_names(vec![ + "base".to_string(), + "exponent".to_string() + ]) + .expect("valid parameter names"), + } + } +} +``` + +The parameter names should match the order of arguments in your function's signature. DataFusion automatically resolves named arguments to the correct positional order before invoking your function. 
+
+### Example
+
+```rust
+# use std::sync::Arc;
+# use std::any::Any;
+# use arrow::datatypes::DataType;
+# use datafusion_common::Result;
+# use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
+# use datafusion_expr::ScalarUDFImpl;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+struct PowerFunction {
+    signature: Signature,
+}
+
+impl PowerFunction {
+    fn new() -> Self {
+        Self {
+            signature: Signature::uniform(
+                2,
+                vec![DataType::Float64],
+                Volatility::Immutable
+            )
+            .with_parameter_names(vec![
+                "base".to_string(),
+                "exponent".to_string()
+            ])
+            .expect("valid parameter names"),
+        }
+    }
+}
+
+impl ScalarUDFImpl for PowerFunction {
+    fn as_any(&self) -> &dyn Any { self }
+    fn name(&self) -> &str { "power" }
+    fn signature(&self) -> &Signature { &self.signature }
+
+    fn return_type(&self, _args: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Float64)
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // Your implementation - arguments are in correct positional order
+        unimplemented!()
+    }
+}
+```
+
+Once registered, users can call your function with named arguments:
+
+```sql
+SELECT power(base => 2.0, exponent => 3.0);
+SELECT power(2.0, exponent => 3.0);
+```
+
+### Error Messages
+
+When a function call fails due to incorrect arguments, DataFusion will show the parameter names in error messages to help users:
+
+```text
+No function matches the given name and argument types substr(Utf8).
+    Candidate functions:
+    substr(str: Any, start_pos: Any)
+    substr(str: Any, start_pos: Any, length: Any)
+```
+
 ## Adding a Window UDF
 Scalar UDFs are functions that take a row of data and return a single value. Window UDFs are similar, but they also have
diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md
index 8b03193e7f99..f34b8b2a5cf0 100644
--- a/docs/source/library-user-guide/upgrading.md
+++ b/docs/source/library-user-guide/upgrading.md
@@ -25,6 +25,15 @@ You can see the current
 [status of the `51.0.0`release here](https://github.com/apache/datafusion/issues/17558)
+### `arrow` / `parquet` updated to 57.0.0
+
+This version of DataFusion upgrades the underlying Apache Arrow implementation
+to version `57.0.0`, including several dependent crates such as `prost`,
+`tonic`, `pyo3`, and `substrait`. See the [release
+notes](https://github.com/apache/arrow-rs/releases/tag/57.0.0) for more details.
+
 ### `MSRV` updated to 1.87.0
 The Minimum Supported Rust Version (MSRV) has been updated to [`1.87.0`].
@@ -116,6 +125,141 @@ Users may need to update their paths to account for these changes.
 See [issue #17713] for more details.
+
+### `FileScanConfig::projection` renamed to `FileScanConfig::projection_exprs`
+
+The `projection` field in `FileScanConfig` has been renamed to `projection_exprs` and its type has changed from `Option<Vec<usize>>` to `Option<ProjectionExprs>`. This change enables more powerful projection pushdown capabilities by supporting arbitrary physical expressions rather than just column indices.
+ +**Impact on direct field access:** + +If you directly access the `projection` field: + +```rust +# /* comment to avoid running +let config: FileScanConfig = ...; +let projection = config.projection; +# */ +``` + +You should update to: + +```rust +# /* comment to avoid running +let config: FileScanConfig = ...; +let projection_exprs = config.projection_exprs; +# */ +``` + +**Impact on builders:** + +The `FileScanConfigBuilder::with_projection()` method has been deprecated in favor of `with_projection_indices()`: + +```diff +let config = FileScanConfigBuilder::new(url, schema, file_source) +- .with_projection(Some(vec![0, 2, 3])) ++ .with_projection_indices(Some(vec![0, 2, 3])) + .build(); +``` + +Note: `with_projection()` still works but is deprecated and will be removed in a future release. + +**What is `ProjectionExprs`?** + +`ProjectionExprs` is a new type that represents a list of physical expressions for projection. While it can be constructed from column indices (which is what `with_projection_indices` does internally), it also supports arbitrary physical expressions, enabling advanced features like expression evaluation during scanning. + +You can access column indices from `ProjectionExprs` using its methods if needed: + +```rust +# /* comment to avoid running +let projection_exprs: ProjectionExprs = ...; +// Get the column indices if the projection only contains simple column references +let indices = projection_exprs.column_indices(); +# */ +``` + +### `DESCRIBE query` support + +`DESCRIBE query` was previously an alias for `EXPLAIN query`, which outputs the +_execution plan_ of the query. With this release, `DESCRIBE query` now outputs +the computed _schema_ of the query, consistent with the behavior of `DESCRIBE table_name`. + +### Introduction of `TableSchema` and changes to `FileSource::with_schema()` method + +A new `TableSchema` struct has been introduced in the `datafusion-datasource` crate to better manage table schemas with partition columns. This struct helps distinguish between: + +- **File schema**: The schema of actual data files on disk +- **Partition columns**: Columns derived from directory structure (e.g., Hive-style partitioning) +- **Table schema**: The complete schema combining both file and partition columns + +As part of this change, the `FileSource::with_schema()` method signature has changed from accepting a `SchemaRef` to accepting a `TableSchema`. 
+
+**Who is affected:**
+
+- Users who have implemented custom `FileSource` implementations will need to update their code
+- Users who only use built-in file sources (Parquet, CSV, JSON, AVRO, Arrow) are not affected
+
+**Migration guide for custom `FileSource` implementations:**
+
+```diff
+ use datafusion_datasource::file::FileSource;
+-use arrow::datatypes::SchemaRef;
++use datafusion_datasource::TableSchema;
+
+ impl FileSource for MyCustomSource {
+-    fn with_schema(&self, schema: SchemaRef) -> Arc<dyn FileSource> {
++    fn with_schema(&self, schema: TableSchema) -> Arc<dyn FileSource> {
+         Arc::new(Self {
+-            schema: Some(schema),
++            // Use schema.file_schema() to get the file schema without partition columns
++            schema: Some(Arc::clone(schema.file_schema())),
+             ..self.clone()
+         })
+     }
+ }
+```
+
+For implementations that need access to partition columns:
+
+```rust,ignore
+fn with_schema(&self, schema: TableSchema) -> Arc<dyn FileSource> {
+    Arc::new(Self {
+        file_schema: Arc::clone(schema.file_schema()),
+        partition_cols: schema.table_partition_cols().clone(),
+        table_schema: Arc::clone(schema.table_schema()),
+        ..self.clone()
+    })
+}
+```
+
+**Note**: Most `FileSource` implementations only need to store the file schema (without partition columns), as shown in the first example. The second pattern of storing all three schema components is typically only needed for advanced use cases where you need access to different schema representations for different operations (e.g., ParquetSource uses the file schema for building pruning predicates but needs the table schema for filter pushdown logic).
+
+**Using `TableSchema` directly:**
+
+If you're constructing a `FileScanConfig` or working with table schemas and partition columns, you can now use `TableSchema`:
+
+```rust
+use datafusion_datasource::TableSchema;
+use arrow::datatypes::{Schema, Field, DataType};
+use std::sync::Arc;
+
+// Create a TableSchema with partition columns
+let file_schema = Arc::new(Schema::new(vec![
+    Field::new("user_id", DataType::Int64, false),
+    Field::new("amount", DataType::Float64, false),
+]));
+
+let partition_cols = vec![
+    Arc::new(Field::new("date", DataType::Utf8, false)),
+    Arc::new(Field::new("region", DataType::Utf8, false)),
+];
+
+let table_schema = TableSchema::new(file_schema, partition_cols);
+
+// Access different schema representations
+let file_schema_ref = table_schema.file_schema(); // Schema without partition columns
+let full_schema = table_schema.table_schema(); // Complete schema with partition columns
+let partition_cols_ref = table_schema.table_partition_cols(); // Just the partition columns
+```
+
 ## DataFusion `50.0.0`
 ### ListingTable automatically detects Hive Partitioned tables
diff --git a/docs/source/user-guide/arrow-introduction.md b/docs/source/user-guide/arrow-introduction.md
new file mode 100644
index 000000000000..89662a0c29c5
--- /dev/null
+++ b/docs/source/user-guide/arrow-introduction.md
@@ -0,0 +1,255 @@
+
+
+# Gentle Arrow Introduction
+
+```{contents}
+:local:
+:depth: 2
+```
+
+## Overview
+
+DataFusion uses [Apache Arrow] as its native in-memory format, so anyone using DataFusion will likely interact with Arrow at some point. This guide introduces the key Arrow concepts you need to know to effectively use DataFusion.
+
+Apache Arrow defines a standardized columnar representation for in-memory data. This enables different systems and languages (e.g., Rust and Python) to share data with zero-copy interchange, avoiding serialization overhead.
In addition to zero copy interchange, Arrow also standardizes best practice columnar data representation enabling high performance analytical processing through vectorized execution. + +## Columnar Layout + +Quick visual: row-major (left) vs Arrow's columnar layout (right). For a deeper primer, see the [arrow2 guide]. + +```text +Traditional Row Storage: Arrow Columnar Storage: +┌──────────────────┐ ┌─────────┬─────────┬──────────┐ +│ id │ name │ age │ │ id │ name │ age │ +├────┼──────┼──────┤ ├─────────┼─────────┼──────────┤ +│ 1 │ A │ 30 │ │ [1,2,3] │ [A,B,C] │[30,25,35]│ +│ 2 │ B │ 25 │ └─────────┴─────────┴──────────┘ +│ 3 │ C │ 35 │ ↑ ↑ ↑ +└──────────────────┘ Int32Array StringArray Int32Array +(read entire rows) (process entire columns at once) +``` + +## `RecordBatch` + +Arrow's standard unit for packaging data is the **[`RecordBatch`]**. + +A **[`RecordBatch`]** represents a horizontal slice of a table—a collection of equal-length columnar arrays that conform to a defined schema. Each column within the slice is a contiguous Arrow array, and all columns have the same number of rows (length). This chunked, immutable unit enables efficient streaming and parallel execution. + +Think of it as having two perspectives: + +- **Columnar inside**: Each column (`id`, `name`, `age`) is a contiguous array optimized for vectorized operations +- **Row-chunked externally**: The batch represents a chunk of rows (e.g., rows 1-1000), making it a manageable unit for streaming + +RecordBatches are **immutable snapshots**—once created, they cannot be modified. Any transformation produces a _new_ RecordBatch, enabling safe parallel processing without locks or coordination overhead. + +This design allows DataFusion to process streams of row-based chunks while gaining maximum performance from the columnar layout. + +## Streaming Through the Engine + +DataFusion processes queries as pull-based pipelines where operators request batches from their inputs. This streaming approach enables early result production, bounds memory usage (spilling to disk only when necessary), and naturally supports parallel execution across multiple CPU cores. + +For example, given the following query: + +```sql +SELECT name FROM 'data.parquet' WHERE id > 10 +``` + +The DataFusion Pipeline looks like this: + +```text + +┌─────────────┐ ┌──────────────┐ ┌────────────────┐ ┌──────────────────┐ ┌──────────┐ +│ Parquet │───▶│ Scan │───▶│ Filter │───▶│ Projection │───▶│ Results │ +│ File │ │ Operator │ │ Operator │ │ Operator │ │ │ +└─────────────┘ └──────────────┘ └────────────────┘ └──────────────────┘ └──────────┘ + (reads data) (id > 10) (keeps "name" col) + RecordBatch ───▶ RecordBatch ────▶ RecordBatch ────▶ RecordBatch +``` + +In this pipeline, [`RecordBatch`]es are the "packages" of columnar data that flow between the different stages of query execution. Each operator processes batches incrementally, enabling the system to produce results before reading the entire input. + +## Creating `ArrayRef` and `RecordBatch`es + +Sometimes you need to create Arrow data programmatically rather than reading from files. + +The first thing needed is creating an Arrow Array, for each column. [arrow-rs] provides array builders and `From` impls to create arrays from Rust vectors. + +```rust +use arrow::array::{StringArray, Int32Array}; +// Create an Int32Array from a vector of i32 values +let ids = Int32Array::from(vec![1, 2, 3]); +// There are similar constructors for other array types, e.g., StringArray, Float64Array, etc. 
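+// (Illustrative aside, not part of the original example.) If the values are not
+// known up front, arrays can also be built incrementally with builders:
+let mut builder = arrow::array::Int32Builder::new();
+builder.append_value(4); // append a value
+builder.append_null(); // append a null entry
+let _built: Int32Array = builder.finish();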
+let names = StringArray::from(vec![Some("alice"), None, Some("carol")]);
+```
+
+Every element in an Arrow array can be "null" (aka missing). Often, arrays are
+created from `Option` values to indicate nullability (e.g., `Some("alice")`
+vs `None` above).
+
+Note: You'll see [`Arc`] used frequently in the code—Arrow arrays are wrapped in
+[`Arc`] (atomically reference-counted pointers) to enable cheap, thread-safe
+sharing across operators and tasks. [`ArrayRef`] is simply a type alias for
+`Arc<dyn Array>`. To create an `ArrayRef`, wrap your array in `Arc::new(...)` as shown below.
+
+```rust
+use std::sync::Arc;
+# use arrow::array::{ArrayRef, Int32Array, StringArray};
+// To get an ArrayRef, wrap the Int32Array in an Arc.
+// (note you will often have to explicitly type annotate to ArrayRef)
+let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+
+// you can also store Strings and other types in ArrayRefs
+let arr: ArrayRef = Arc::new(
+    StringArray::from(vec![Some("alice"), None, Some("carol")])
+);
+```
+
+To create a [`RecordBatch`], you need to define its [`Schema`] (the column names and types) and provide the corresponding columns as [`ArrayRef`]s as shown below:
+
+```rust
+# use std::sync::Arc;
+# use arrow_schema::ArrowError;
+# use arrow::array::{ArrayRef, Int32Array, StringArray, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+
+// Create the columns as Arrow arrays
+let ids = Int32Array::from(vec![1, 2, 3]);
+let names = StringArray::from(vec![Some("alice"), None, Some("carol")]);
+// Create the schema
+let schema = Arc::new(Schema::new(vec![
+    Field::new("id", DataType::Int32, false), // false means non-nullable
+    Field::new("name", DataType::Utf8, true), // true means nullable
+]));
+// Assemble the columns
+let cols: Vec<ArrayRef> = vec![
+    Arc::new(ids),
+    Arc::new(names)
+];
+// Finally, create the RecordBatch
+RecordBatch::try_new(schema, cols).expect("Failed to create RecordBatch");
+```
+
+## Working with `ArrayRef` and `RecordBatch`
+
+Most DataFusion APIs are in terms of [`ArrayRef`] and [`RecordBatch`]. To work with the
+underlying data, you typically downcast the [`ArrayRef`] to its concrete type
+(e.g., [`Int32Array`]).
+
+To do so either use the `as_any().downcast_ref::<Int32Array>()` method or the
+`as_primitive::<Int32Type>()` helper method from the [AsArray] trait.
+
+[asarray]: https://docs.rs/arrow-array/latest/arrow_array/cast/trait.AsArray.html
+
+```rust
+# use std::sync::Arc;
+# use arrow::datatypes::{DataType, Int32Type};
+# use arrow::array::{AsArray, ArrayRef, Int32Array, RecordBatch};
+# let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+// First check the data type of the array
+match arr.data_type() {
+    &DataType::Int32 => {
+        // Downcast to Int32Array
+        let int_array = arr.as_primitive::<Int32Type>();
+        // Now you can access Int32Array methods
+        for i in 0..int_array.len() {
+            println!("Value at index {}: {}", i, int_array.value(i));
+        }
+    }
+    _ => {
+        println!("Array is not of type Int32");
+    }
+}
+```
+
+The following two downcasting methods are equivalent:
+
+```rust
+# use std::sync::Arc;
+# use arrow::datatypes::{DataType, Int32Type};
+# use arrow::array::{AsArray, ArrayRef, Int32Array, RecordBatch};
+# let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+// Downcast to Int32Array using as_any
+let int_array1 = arr.as_any().downcast_ref::<Int32Array>().unwrap();
+// This is the same as using the as_primitive::<Int32Type>() helper
+let int_array2 = arr.as_primitive::<Int32Type>();
+assert_eq!(int_array1, int_array2);
+```
+
+## Common Pitfalls
+
+When working with Arrow and RecordBatches, watch out for these common issues:
+
+- **Schema consistency**: All batches in a stream must share the exact same [`Schema`]. For example, you can't have one batch where a column is [`Int32`] and the next where it's [`Int64`], even if the values would fit
+- **Immutability**: Arrays are immutable—to "modify" data, you must build new arrays or new RecordBatches. For instance, to change a value in an array, you'd create a new array with the updated value
+- **Row by Row Processing**: Avoid iterating over arrays element by element when possible; use Arrow's built-in [compute kernels] instead
+- **Type mismatches**: Mixed input types across files may require explicit casts. For example, a string column `"123"` from a CSV file won't automatically join with an integer column `123` from a Parquet file—you'll need to cast one to match the other. Use Arrow's [`cast`] kernel where appropriate
+- **Batch size assumptions**: Don't assume a particular batch size; always iterate until the stream ends. One file might produce 8192-row batches while another produces 1024-row batches
+
+[compute kernels]: https://docs.rs/arrow/latest/arrow/compute/index.html
+
+## Further reading
+
+**Arrow Documentation:**
+
+- [Arrow Format Introduction](https://arrow.apache.org/docs/format/Intro.html) - Understand the Arrow specification and why it enables zero-copy data sharing
+- [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) - Deep dive into memory layout for performance optimization
+- [Arrow Rust Documentation](https://docs.rs/arrow/latest/arrow/) - Complete API reference for the Rust implementation
+
+**Key API References:**
+
+- [RecordBatch](https://docs.rs/arrow-array/latest/arrow_array/struct.RecordBatch.html) - The fundamental data structure for columnar data (a table slice)
+- [ArrayRef](https://docs.rs/arrow-array/latest/arrow_array/array/type.ArrayRef.html) - Represents a reference-counted Arrow array (single column)
+- [DataType](https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html) - Enum of all supported Arrow data types (e.g., Int32, Utf8)
+- [Schema](https://docs.rs/arrow-schema/latest/arrow_schema/struct.Schema.html) - Describes the structure of a RecordBatch (column names and types)
+
+[apache arrow]: https://arrow.apache.org/docs/index.html
+[`arc`]: https://doc.rust-lang.org/std/sync/struct.Arc.html
+[`arrayref`]: https://docs.rs/arrow-array/latest/arrow_array/array/type.ArrayRef.html
+[`cast`]: https://docs.rs/arrow/latest/arrow/compute/fn.cast.html
+[`field`]: https://docs.rs/arrow-schema/latest/arrow_schema/struct.Field.html
+[`schema`]: https://docs.rs/arrow-schema/latest/arrow_schema/struct.Schema.html
+[`datatype`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html
+[`int32array`]: https://docs.rs/arrow-array/latest/arrow_array/array/struct.Int32Array.html
+[`stringarray`]:
https://docs.rs/arrow-array/latest/arrow_array/array/struct.StringArray.html +[`int32`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html#variant.Int32 +[`int64`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html#variant.Int64 +[extension points]: ../library-user-guide/extensions.md +[`tableprovider`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html +[custom table providers guide]: ../library-user-guide/custom-table-providers.md +[user-defined functions (udfs)]: ../library-user-guide/functions/adding-udfs.md +[custom optimizer rules and physical operators]: ../library-user-guide/extending-operators.md +[`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html +[`.register_table()`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.register_table +[`.sql()`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql +[`.show()`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.show +[`memtable`]: https://docs.rs/datafusion/latest/datafusion/datasource/struct.MemTable.html +[`sessioncontext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html +[`csvreadoptions`]: https://docs.rs/datafusion/latest/datafusion/execution/options/struct.CsvReadOptions.html +[`parquetreadoptions`]: https://docs.rs/datafusion/latest/datafusion/execution/options/struct.ParquetReadOptions.html +[`recordbatch`]: https://docs.rs/arrow-array/latest/arrow_array/struct.RecordBatch.html +[`read_csv`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_csv +[`read_parquet`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_parquet +[`read_json`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_json +[`read_avro`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_avro +[`dataframe`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html +[`.collect()`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.collect +[arrow2 guide]: https://jorgecarleitao.github.io/arrow2/main/guide/arrow.html#what-is-apache-arrow +[configuration settings]: configs.md +[`datafusion.execution.batch_size`]: configs.md#setting-configuration-options diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index c0e4ccd850d9..7ca5eb8f7be4 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -81,7 +81,7 @@ The following configuration settings are available: | datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | | datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | | datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. 
This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata |
-| datafusion.execution.parquet.metadata_size_hint | NULL | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer |
+| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer. The default setting is 512 KiB, which should be sufficient for most parquet files; it can save one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. |
 | datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". |
 | datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query |
 | datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. |
@@ -253,3 +253,63 @@ SET datafusion.execution.batch_size = 1024;
 ```
 [`fairspillpool`]: https://docs.rs/datafusion/latest/datafusion/execution/memory_pool/struct.FairSpillPool.html
+
+## Join Queries
+
+Apache DataFusion currently supports the following join algorithms:
+
+- Nested Loop Join
+- Sort Merge Join
+- Hash Join
+- Symmetric Hash Join
+- Piecewise Merge Join (experimental)
+
+The physical planner will choose the appropriate algorithm based on the statistics and join
+condition of the two tables.
+
+## Join Algorithm Optimizer Configurations
+
+You can modify join optimization behavior in your queries by setting specific configuration values.
+Use the following command to update a configuration:
+
+```sql
+SET datafusion.optimizer.<option_name> = <value>;
+```
+
+Example:
+
+```sql
+SET datafusion.optimizer.prefer_hash_join = false;
+```
+
+Adjusting the following configuration values influences how the optimizer selects the join algorithm
+used to execute your SQL query.
+
+### allow_symmetric_joins_without_pruning (bool, default = true)
+
+Controls whether symmetric hash joins are allowed for unbounded data sources even when their inputs
+lack ordering or filtering.
+
+- If disabled, the `SymmetricHashJoin` operator cannot prune its internal buffers, resulting in certain join types being produced only at the end of execution.
+
+### prefer_hash_join (bool, default = true)
+
+Determines whether the optimizer prefers Hash Join over Sort Merge Join during physical plan selection.
+
+- true: favors HashJoin for faster execution when sufficient memory is available.
+- false: allows SortMergeJoin to be chosen when more memory-efficient execution is needed.
+
+### enable_piecewise_merge_join (bool, default = false)
+
+Enables the experimental Piecewise Merge Join algorithm.
+
+- When enabled, the physical planner may select PiecewiseMergeJoin if there is exactly one range
+  filter in the join condition.
+- Piecewise Merge Join is faster than Nested Loop Join for a single range filter,
+  except when joining two large tables (num_rows > 100,000) that are approximately
+  equal in size.
diff --git a/docs/source/user-guide/dataframe.md b/docs/source/user-guide/dataframe.md
index 82f1eeb2823d..85724a72399a 100644
--- a/docs/source/user-guide/dataframe.md
+++ b/docs/source/user-guide/dataframe.md
@@ -19,6 +19,8 @@
 # DataFrame API
+## DataFrame overview
+
 A DataFrame represents a logical set of rows with the same named columns, similar to a
 [Pandas DataFrame] or [Spark DataFrame].
diff --git a/docs/source/user-guide/metrics.md b/docs/source/user-guide/metrics.md
index f2634b901518..1fb2f4a5c770 100644
--- a/docs/source/user-guide/metrics.md
+++ b/docs/source/user-guide/metrics.md
@@ -27,10 +27,11 @@ DataFusion operators expose runtime metrics so you can understand where time is
 `BaselineMetrics` are available in most physical operators to capture common measurements.
-| Metric          | Description                                             |
-| --------------- | ------------------------------------------------------- |
-| elapsed_compute | CPU time the operator actively spends processing work.  |
-| output_rows     | Total number of rows the operator produces.              |
+| Metric          | Description |
+| --------------- | ----------- |
+| elapsed_compute | CPU time the operator actively spends processing work. |
+| output_rows     | Total number of rows the operator produces. |
+| output_bytes    | Memory usage of all output batches. Note: This value may be overestimated. If multiple output `RecordBatch` instances share underlying memory buffers, their sizes will be counted multiple times.
| ## Operator-specific Metrics diff --git a/docs/source/user-guide/sql/data_types.md b/docs/source/user-guide/sql/data_types.md index d977a4396e40..02edb6371ce3 100644 --- a/docs/source/user-guide/sql/data_types.md +++ b/docs/source/user-guide/sql/data_types.md @@ -41,7 +41,18 @@ You can cast a SQL expression to a specific Arrow type using the `arrow_cast` fu For example, to cast the output of `now()` to a `Timestamp` with second precision: ```sql -select arrow_cast(now(), 'Timestamp(Second, None)'); +select arrow_cast(now(), 'Timestamp(s)') as "now()"; ++---------------------+ +| now() | ++---------------------+ +| 2025-10-24T20:02:45 | ++---------------------+ +``` + +The older syntax still works as well: + +```sql +select arrow_cast(now(), 'Timestamp(Second, None)') as "now()"; +---------------------+ | now() | +---------------------+ diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index d2e7066191f9..d090b5b70cda 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2749,11 +2749,11 @@ to_local_time(expression) FROM ( SELECT '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' AS time ); -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ -| time | type | to_local_time | to_local_time_type | -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ -| 2024-04-01T00:00:20+02:00 | Timestamp(Nanosecond, Some("Europe/Brussels")) | 2024-04-01T00:00:20 | Timestamp(Nanosecond, None) | -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ ++---------------------------+----------------------------------+---------------------+--------------------+ +| time | type | to_local_time | to_local_time_type | ++---------------------------+----------------------------------+---------------------+--------------------+ +| 2024-04-01T00:00:20+02:00 | Timestamp(ns, "Europe/Brussels") | 2024-04-01T00:00:20 | Timestamp(ns) | ++---------------------------+----------------------------------+---------------------+--------------------+ # combine `to_local_time()` with `date_bin()` to bin on boundaries in the timezone rather # than UTC boundaries @@ -2777,7 +2777,7 @@ FROM ( Converts a value to a timestamp (`YYYY-MM-DDT00:00:00Z`). Supports strings, integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats] are provided. Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp. -Note: `to_timestamp` returns `Timestamp(Nanosecond)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. +Note: `to_timestamp` returns `Timestamp(ns)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. 
```sql to_timestamp(expression[, ..., format_n]) @@ -4182,7 +4182,8 @@ flatten(array) Similar to the range function, but it includes the upper bound. ```sql -generate_series(start, stop, step) +generate_series(stop) +generate_series(start, stop[, step]) ``` #### Arguments @@ -4402,7 +4403,8 @@ _Alias of [make_array](#make_array)._ Returns an Arrow array between start and stop with step. The range start..end contains all values with start <= x < end. It is empty if start >= end. Step cannot be 0. ```sql -range(start, stop, step) +range(stop) +range(start, stop[, step]) ``` #### Arguments @@ -4422,11 +4424,11 @@ range(start, stop, step) +-----------------------------------+ > select range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH); -+--------------------------------------------------------------+ -| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | -+--------------------------------------------------------------+ ++--------------------------------------------------------------------------+ +| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | ++--------------------------------------------------------------------------+ | [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] | -+--------------------------------------------------------------+ ++--------------------------------------------------------------------------+ ``` ### `string_to_array` @@ -4972,16 +4974,26 @@ arrow_cast(expression, datatype) #### Example ```sql -> select arrow_cast(-5, 'Int8') as a, +> select + arrow_cast(-5, 'Int8') as a, arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, - arrow_cast('bar', 'LargeUtf8') as c, - arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d - ; -+----+-----+-----+---------------------------+ -| a | b | c | d | -+----+-----+-----+---------------------------+ -| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | -+----+-----+-----+---------------------------+ + arrow_cast('bar', 'LargeUtf8') as c; + ++----+-----+-----+ +| a | b | c | ++----+-----+-----+ +| -5 | foo | bar | ++----+-----+-----+ + +> select + arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs, "+08:00")') as d, + arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs)') as e; + ++---------------------------+---------------------+ +| d | e | ++---------------------------+---------------------+ +| 2023-01-02T12:53:02+08:00 | 2023-01-02T12:53:02 | ++---------------------------+---------------------+ ``` ### `arrow_typeof`