diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt index 6578d5664cd30..c4cffd67b16a9 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-jdk11-results.txt @@ -2,251 +2,269 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz -SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 13405 13422 24 1.2 852.3 1.0X -SQL Json 10723 10788 92 1.5 681.7 1.3X -SQL Parquet Vectorized 164 217 50 95.9 10.4 81.8X -SQL Parquet MR 2349 2440 129 6.7 149.3 5.7X -SQL ORC Vectorized 312 346 23 50.4 19.8 43.0X -SQL ORC MR 1610 1659 69 9.8 102.4 8.3X +SQL CSV 9999 10058 83 1.6 635.7 1.0X +SQL Json 8857 8883 37 1.8 563.1 1.1X +SQL Parquet Vectorized 132 157 16 119.0 8.4 75.7X +SQL Parquet MR 1987 1997 14 7.9 126.3 5.0X +SQL ORC Vectorized 186 227 34 84.3 11.9 53.6X +SQL ORC MR 1559 1602 62 10.1 99.1 6.4X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized 110 117 9 143.0 7.0 1.0X +ParquetReader Vectorized -> Row 57 59 3 276.2 3.6 1.9X -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SQL CSV 12897 12916 28 1.2 819.9 1.0X +SQL Json 9739 9770 44 1.6 619.2 1.3X +SQL Parquet Vectorized 226 237 14 69.7 14.3 57.2X +SQL Parquet MR 2124 2127 4 7.4 135.1 6.1X +SQL ORC Vectorized 213 250 39 73.9 13.5 60.6X +SQL ORC MR 1535 1548 19 10.2 97.6 8.4X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 187 209 20 84.3 11.9 1.0X -ParquetReader Vectorized -> Row 89 95 5 177.6 5.6 2.1X +ParquetReader Vectorized 259 269 15 60.6 16.5 1.0X +ParquetReader Vectorized -> Row 168 184 33 93.9 10.7 1.5X -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 14214 14549 474 1.1 903.7 1.0X -SQL Json 11866 11934 95 1.3 754.4 1.2X -SQL Parquet Vectorized 294 342 53 53.6 18.7 48.4X -SQL Parquet MR 2929 3004 107 5.4 186.2 4.9X -SQL ORC Vectorized 312 328 15 50.4 19.8 45.5X -SQL ORC MR 2037 2097 84 7.7 129.5 7.0X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 12765 12774 13 1.2 811.6 1.0X +SQL Json 10144 10158 21 1.6 644.9 1.3X +SQL Parquet Vectorized 168 208 34 93.7 10.7 76.1X +SQL Parquet MR 2443 2458 21 6.4 155.3 5.2X +SQL ORC Vectorized 300 313 16 52.4 19.1 42.5X +SQL ORC MR 1736 1780 62 9.1 110.4 7.4X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 249 266 18 63.1 15.8 1.0X -ParquetReader Vectorized -> Row 192 247 36 82.1 12.2 1.3X +ParquetReader Vectorized 229 239 9 68.6 14.6 1.0X +ParquetReader Vectorized -> Row 224 265 26 70.2 14.3 1.0X -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15502 15817 446 1.0 985.6 1.0X -SQL Json 12638 12646 11 1.2 803.5 1.2X -SQL Parquet Vectorized 193 256 44 81.7 12.2 80.5X -SQL Parquet MR 2943 2953 14 5.3 187.1 5.3X -SQL ORC Vectorized 324 370 34 48.5 20.6 47.8X -SQL ORC MR 2110 2163 75 7.5 134.1 7.3X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 14055 14060 6 1.1 893.6 1.0X +SQL Json 10692 10738 64 1.5 679.8 1.3X +SQL Parquet Vectorized 167 223 34 94.0 10.6 84.0X +SQL Parquet MR 2416 2482 94 6.5 153.6 5.8X +SQL ORC Vectorized 329 344 12 47.8 20.9 42.7X +SQL ORC MR 1773 1789 23 8.9 112.7 7.9X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ParquetReader Vectorized 276 287 14 57.0 17.6 1.0X -ParquetReader Vectorized -> Row 309 320 9 50.9 19.6 0.9X +ParquetReader Vectorized 232 239 9 67.9 14.7 1.0X +ParquetReader Vectorized -> Row 262 295 23 60.1 16.6 0.9X -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 20156 20694 761 0.8 1281.5 1.0X -SQL Json 15228 15380 214 1.0 968.2 1.3X -SQL Parquet Vectorized 325 346 20 48.4 20.7 62.0X -SQL Parquet MR 3144 3228 118 5.0 199.9 6.4X -SQL ORC Vectorized 516 526 7 30.5 32.8 39.0X -SQL ORC MR 2353 2367 19 6.7 149.6 8.6X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 18964 18975 17 0.8 1205.7 1.0X +SQL Json 13173 13189 23 1.2 837.5 1.4X +SQL Parquet Vectorized 278 290 11 56.6 17.7 68.2X +SQL Parquet MR 2565 2589 34 6.1 163.1 7.4X +SQL ORC Vectorized 432 481 48 36.4 27.5 43.9X +SQL ORC MR 2052 2061 12 7.7 130.5 9.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ParquetReader Vectorized 372 396 24 42.3 23.6 1.0X -ParquetReader Vectorized -> Row 437 462 25 36.0 27.8 0.9X +ParquetReader Vectorized 296 321 29 53.2 18.8 1.0X +ParquetReader Vectorized -> Row 329 335 7 47.7 20.9 0.9X -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17413 17599 263 0.9 1107.1 1.0X -SQL Json 14416 14453 53 1.1 916.5 1.2X -SQL Parquet Vectorized 181 225 35 86.8 11.5 96.1X -SQL Parquet MR 2940 2996 78 5.3 186.9 5.9X -SQL ORC Vectorized 470 494 29 33.5 29.9 37.1X -SQL ORC MR 2351 2379 39 6.7 149.5 7.4X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 15092 15095 5 1.0 959.5 1.0X +SQL Json 12166 12169 5 1.3 773.5 1.2X +SQL Parquet Vectorized 161 198 27 97.4 10.3 93.5X +SQL Parquet MR 2407 2412 6 6.5 153.0 6.3X +SQL ORC Vectorized 476 509 30 33.1 30.2 31.7X +SQL ORC MR 1978 1981 5 8.0 125.7 7.6X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ParquetReader Vectorized 268 282 14 58.7 17.0 1.0X -ParquetReader Vectorized -> Row 298 321 18 52.8 18.9 0.9X +ParquetReader Vectorized 256 261 9 61.4 16.3 1.0X +ParquetReader Vectorized -> Row 210 257 22 74.7 13.4 1.2X -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 21666 21697 43 0.7 1377.5 1.0X -SQL Json 18307 18363 79 0.9 1163.9 1.2X -SQL Parquet Vectorized 310 337 22 50.7 19.7 69.9X -SQL Parquet MR 3089 3103 19 5.1 196.4 7.0X -SQL ORC Vectorized 589 617 31 26.7 37.5 36.8X -SQL ORC MR 2307 2377 98 6.8 146.7 9.4X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 19785 19786 1 0.8 1257.9 1.0X +SQL Json 16339 16340 1 1.0 1038.8 1.2X +SQL Parquet Vectorized 284 302 19 55.4 18.1 69.7X +SQL Parquet MR 2570 2576 8 6.1 163.4 7.7X +SQL ORC Vectorized 473 519 32 33.3 30.0 41.9X +SQL ORC MR 2136 2142 9 7.4 135.8 9.3X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ParquetReader Vectorized 400 415 18 39.3 25.4 1.0X -ParquetReader Vectorized -> Row 393 406 11 40.1 25.0 1.0X +ParquetReader Vectorized 298 351 32 52.8 18.9 1.0X +ParquetReader Vectorized -> Row 370 375 9 42.5 23.5 0.8X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17703 17719 22 0.6 1688.3 1.0X -SQL Json 13095 13168 103 0.8 1248.9 1.4X -SQL Parquet Vectorized 2253 2266 19 4.7 214.8 7.9X -SQL Parquet MR 4913 4977 91 2.1 468.5 3.6X -SQL ORC Vectorized 2457 2467 14 4.3 234.3 7.2X -SQL ORC MR 4433 4464 44 2.4 422.8 4.0X +SQL CSV 13811 13824 18 0.8 1317.1 1.0X +SQL Json 11546 11589 61 0.9 1101.1 1.2X +SQL Parquet Vectorized 2143 2164 30 4.9 204.4 6.4X +SQL Parquet MR 4369 4386 24 2.4 416.7 3.2X +SQL ORC Vectorized 2289 2294 8 4.6 218.3 6.0X +SQL ORC MR 3770 3847 109 2.8 359.5 3.7X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9741 9804 89 1.1 929.0 1.0X -SQL Json 8230 8401 241 1.3 784.9 1.2X -SQL Parquet Vectorized 618 650 31 17.0 58.9 15.8X -SQL Parquet MR 2258 2311 75 4.6 215.4 4.3X -SQL ORC Vectorized 608 629 15 17.3 58.0 16.0X -SQL ORC MR 2466 2479 18 4.3 235.2 4.0X +SQL CSV 7344 7377 47 1.4 700.3 1.0X +SQL Json 7117 7153 51 1.5 678.7 1.0X +SQL Parquet Vectorized 598 618 18 17.5 57.0 12.3X +SQL Parquet MR 1955 1969 20 5.4 186.5 3.8X +SQL ORC Vectorized 559 565 8 18.8 53.3 13.1X +SQL ORC MR 1923 1932 13 5.5 183.4 3.8X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column - CSV 24195 24573 534 0.7 1538.3 1.0X -Data column - Json 14746 14883 194 1.1 937.5 1.6X -Data column - Parquet Vectorized 352 385 34 44.7 22.4 68.7X -Data column - Parquet MR 3674 3694 27 4.3 233.6 6.6X -Data column - ORC Vectorized 480 505 26 32.8 30.5 50.4X -Data column - ORC MR 2913 3004 128 5.4 185.2 8.3X -Partition column - CSV 7527 7544 23 2.1 478.6 3.2X -Partition column - Json 11955 12051 135 1.3 760.1 2.0X -Partition column - Parquet Vectorized 65 92 29 242.5 4.1 373.0X -Partition column - Parquet MR 1614 1628 21 9.7 102.6 15.0X -Partition column - ORC Vectorized 71 99 29 220.1 4.5 338.5X -Partition column - ORC MR 1761 1769 11 8.9 112.0 13.7X -Both columns - CSV 24077 24127 70 0.7 1530.8 1.0X -Both columns - Json 15286 15479 273 1.0 971.9 1.6X -Both columns - Parquet Vectorized 376 412 40 41.9 23.9 64.4X -Both columns - Parquet MR 3808 3826 26 4.1 242.1 6.4X -Both columns - ORC Vectorized 560 604 42 28.1 35.6 43.2X -Both columns - ORC MR 3046 3080 49 5.2 193.7 7.9X +Data column - CSV 19266 19281 21 0.8 1224.9 1.0X +Data column - Json 13119 13126 10 1.2 834.1 1.5X +Data column - Parquet Vectorized 305 334 27 51.6 19.4 63.2X +Data column - Parquet MR 2978 3022 63 5.3 189.3 6.5X +Data column - ORC Vectorized 446 480 32 35.3 28.3 43.2X +Data column - ORC MR 2451 2469 24 6.4 155.9 7.9X +Partition column - CSV 6640 6641 1 2.4 422.2 2.9X +Partition column - Json 10485 10512 37 1.5 666.6 1.8X +Partition column - Parquet Vectorized 65 88 24 241.2 4.1 295.4X +Partition column - Parquet MR 1403 1434 44 11.2 89.2 13.7X +Partition column - ORC Vectorized 62 86 21 253.8 3.9 310.9X +Partition column - ORC MR 1523 1525 3 10.3 96.8 12.6X +Both columns - CSV 19347 19354 10 0.8 1230.0 1.0X +Both columns - Json 13788 13793 6 1.1 876.6 1.4X +Both columns - Parquet Vectorized 346 414 70 45.5 22.0 55.7X +Both columns - Parquet MR 3022 3032 14 5.2 192.1 6.4X +Both columns - ORC Vectorized 479 519 28 32.9 30.4 40.2X +Both columns - ORC MR 2539 2540 1 6.2 161.4 7.6X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11805 12021 306 0.9 1125.8 1.0X -SQL Json 12051 12105 77 0.9 1149.3 1.0X -SQL Parquet Vectorized 1474 1545 100 7.1 140.6 8.0X -SQL Parquet MR 4488 4492 4 2.3 428.1 2.6X -ParquetReader Vectorized 1140 1140 1 9.2 108.7 10.4X -SQL ORC Vectorized 1164 1178 20 9.0 111.0 10.1X -SQL ORC MR 3745 3817 102 2.8 357.1 3.2X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 9158 9163 8 1.1 873.3 1.0X +SQL Json 10429 10448 27 1.0 994.6 0.9X +SQL Parquet Vectorized 1363 1660 420 7.7 130.0 6.7X +SQL Parquet MR 3894 3898 5 2.7 371.4 2.4X +ParquetReader Vectorized 1021 1031 14 10.3 97.4 9.0X +SQL ORC Vectorized 1168 1191 33 9.0 111.4 7.8X +SQL ORC MR 3267 3287 28 3.2 311.6 2.8X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 9814 9837 33 1.1 936.0 1.0X -SQL Json 9317 9445 182 1.1 888.5 1.1X -SQL Parquet Vectorized 1117 1155 52 9.4 106.6 8.8X -SQL Parquet MR 3463 3538 106 3.0 330.3 2.8X -ParquetReader Vectorized 1033 1039 8 10.1 98.6 9.5X -SQL ORC Vectorized 1307 1353 65 8.0 124.7 7.5X -SQL ORC MR 3644 3690 65 2.9 347.5 2.7X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 7570 7577 11 1.4 721.9 1.0X +SQL Json 8085 8096 14 1.3 771.1 0.9X +SQL Parquet Vectorized 1097 1101 5 9.6 104.7 6.9X +SQL Parquet MR 2999 3014 21 3.5 286.0 2.5X +ParquetReader Vectorized 1052 1064 18 10.0 100.3 7.2X +SQL ORC Vectorized 1286 2162 1239 8.2 122.6 5.9X +SQL ORC MR 3053 3123 100 3.4 291.1 2.5X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8145 8270 176 1.3 776.8 1.0X -SQL Json 5714 5764 71 1.8 544.9 1.4X -SQL Parquet Vectorized 235 264 15 44.6 22.4 34.7X -SQL Parquet MR 2398 2412 19 4.4 228.7 3.4X -ParquetReader Vectorized 248 262 11 42.3 23.6 32.9X -SQL ORC Vectorized 430 462 37 24.4 41.0 18.9X -SQL ORC MR 1983 1993 14 5.3 189.1 4.1X +SQL CSV 6211 6214 3 1.7 592.4 1.0X +SQL Json 4977 4994 24 2.1 474.6 1.2X +SQL Parquet Vectorized 260 272 10 40.3 24.8 23.9X +SQL Parquet MR 1981 1985 5 5.3 188.9 3.1X +ParquetReader Vectorized 268 276 11 39.1 25.6 23.2X +SQL ORC Vectorized 428 457 35 24.5 40.8 14.5X +SQL ORC MR 1696 1705 12 6.2 161.8 3.7X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 2448 2461 18 0.4 2334.3 1.0X -SQL Json 3332 3370 53 0.3 3177.6 0.7X -SQL Parquet Vectorized 51 87 25 20.7 48.2 48.4X -SQL Parquet MR 239 278 35 4.4 227.5 10.3X -SQL ORC Vectorized 60 82 19 17.5 57.3 40.8X -SQL ORC MR 197 219 26 5.3 188.3 12.4X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 2067 2093 36 0.5 1971.6 1.0X +SQL Json 3047 5663 NaN 0.3 2906.0 0.7X +SQL Parquet Vectorized 50 73 21 20.9 47.7 41.3X +SQL Parquet MR 205 224 28 5.1 195.3 10.1X +SQL ORC Vectorized 60 79 23 17.4 57.5 34.3X +SQL ORC MR 173 196 25 6.1 165.1 11.9X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 6034 6061 39 0.2 5754.0 1.0X -SQL Json 12232 12315 118 0.1 11665.4 0.5X -SQL Parquet Vectorized 73 120 30 14.4 69.6 82.6X -SQL Parquet MR 316 368 44 3.3 301.1 19.1X -SQL ORC Vectorized 76 122 36 13.7 72.9 79.0X -SQL ORC MR 206 261 47 5.1 196.5 29.3X - -OpenJDK 64-Bit Server VM 11.0.10+9-LTS on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +SQL CSV 4841 4844 5 0.2 4616.4 1.0X +SQL Json 11721 11745 34 0.1 11177.9 0.4X +SQL Parquet Vectorized 67 101 27 15.7 63.8 72.4X +SQL Parquet MR 225 247 27 4.7 214.2 21.5X +SQL ORC Vectorized 75 99 26 13.9 71.7 64.4X +SQL ORC MR 192 219 26 5.5 183.4 25.2X + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1021-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10307 10309 4 0.1 9829.0 1.0X -SQL Json 23412 23539 180 0.0 22327.7 0.4X -SQL Parquet Vectorized 105 151 23 10.0 99.9 98.4X -SQL Parquet MR 295 325 29 3.6 281.5 34.9X -SQL ORC Vectorized 85 112 31 12.4 81.0 121.4X -SQL ORC MR 212 255 66 4.9 202.3 48.6X +SQL CSV 8410 8414 5 0.1 8020.8 1.0X +SQL Json 22537 22923 547 0.0 21492.8 0.4X +SQL Parquet Vectorized 101 141 32 10.4 96.2 83.4X +SQL Parquet MR 262 289 45 4.0 249.9 32.1X +SQL ORC Vectorized 90 113 32 11.7 85.4 93.9X +SQL ORC MR 210 232 36 5.0 200.3 40.0X diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt index fe083703ae0ea..65db1afc51193 100644 --- a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt +++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt @@ -2,251 +2,269 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz -SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15943 15956 18 1.0 1013.6 1.0X -SQL Json 9109 9158 70 1.7 579.1 1.8X -SQL Parquet Vectorized 168 191 16 93.8 10.7 95.1X -SQL Parquet MR 1938 1950 17 8.1 123.2 8.2X -SQL ORC Vectorized 191 199 6 82.2 12.2 83.3X -SQL ORC MR 1523 1537 20 10.3 96.8 10.5X +SQL CSV 11497 11744 349 1.4 731.0 1.0X +SQL Json 7073 7099 37 2.2 449.7 1.6X +SQL Parquet Vectorized 105 126 17 149.9 6.7 109.6X +SQL Parquet MR 1647 1648 2 9.6 104.7 7.0X +SQL ORC Vectorized 157 167 5 100.0 10.0 73.1X +SQL ORC MR 1466 1485 27 10.7 93.2 7.8X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Parquet Reader Single BOOLEAN Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------- +ParquetReader Vectorized 114 123 8 137.8 7.3 1.0X +ParquetReader Vectorized -> Row 42 44 1 372.1 2.7 2.7X -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SQL CSV 15825 15961 193 1.0 1006.1 1.0X +SQL Json 7966 8054 125 2.0 506.5 2.0X +SQL Parquet Vectorized 136 148 9 115.4 8.7 116.1X +SQL Parquet MR 1814 1825 15 8.7 115.4 8.7X +SQL ORC Vectorized 138 147 6 114.4 8.7 115.1X +SQL ORC MR 1299 1382 117 12.1 82.6 12.2X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 203 206 3 77.5 12.9 1.0X -ParquetReader Vectorized -> Row 97 100 2 161.6 6.2 2.1X +ParquetReader Vectorized 179 185 9 88.0 11.4 1.0X +ParquetReader Vectorized -> Row 91 101 3 172.6 5.8 2.0X -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 17062 17089 38 0.9 1084.8 1.0X -SQL Json 9718 9720 3 1.6 617.9 1.8X -SQL Parquet Vectorized 326 333 7 48.2 20.7 52.3X -SQL Parquet MR 2305 2329 34 6.8 146.6 7.4X -SQL ORC Vectorized 201 205 3 78.2 12.8 84.8X -SQL ORC MR 1795 1796 0 8.8 114.1 9.5X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 15449 16211 1077 1.0 982.2 1.0X +SQL Json 7955 8292 476 2.0 505.8 1.9X +SQL Parquet Vectorized 195 211 8 80.7 12.4 79.2X +SQL Parquet MR 1866 1890 33 8.4 118.7 8.3X +SQL ORC Vectorized 163 173 8 96.6 10.4 94.9X +SQL ORC MR 1550 1555 8 10.1 98.5 10.0X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -ParquetReader Vectorized 333 339 7 47.2 21.2 1.0X -ParquetReader Vectorized -> Row 283 285 3 55.7 18.0 1.2X +ParquetReader Vectorized 299 302 4 52.5 19.0 1.0X +ParquetReader Vectorized -> Row 264 280 14 59.6 16.8 1.1X -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18722 18809 123 0.8 1190.3 1.0X -SQL Json 10192 10249 80 1.5 648.0 1.8X -SQL Parquet Vectorized 155 162 8 101.6 9.8 120.9X -SQL Parquet MR 2348 2360 16 6.7 149.3 8.0X -SQL ORC Vectorized 265 275 7 59.3 16.9 70.5X -SQL ORC MR 1892 1938 65 8.3 120.3 9.9X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 16640 16834 273 0.9 1058.0 1.0X +SQL Json 8859 8862 3 1.8 563.3 1.9X +SQL Parquet Vectorized 144 155 8 109.0 9.2 115.3X +SQL Parquet MR 1960 2023 89 8.0 124.6 8.5X +SQL ORC Vectorized 218 233 11 72.3 13.8 76.5X +SQL ORC MR 1440 1442 3 10.9 91.6 11.6X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ParquetReader Vectorized 243 251 7 64.8 15.4 1.0X -ParquetReader Vectorized -> Row 222 229 5 70.9 14.1 1.1X +ParquetReader Vectorized 224 241 13 70.2 14.2 1.0X +ParquetReader Vectorized -> Row 214 221 10 73.6 13.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 24299 24358 84 0.6 1544.9 1.0X -SQL Json 13349 13429 114 1.2 848.7 1.8X -SQL Parquet Vectorized 215 241 59 73.3 13.6 113.2X -SQL Parquet MR 2508 2508 0 6.3 159.4 9.7X -SQL ORC Vectorized 323 330 6 48.7 20.5 75.2X -SQL ORC MR 1993 2009 22 7.9 126.7 12.2X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 22998 23324 461 0.7 1462.2 1.0X +SQL Json 12165 12179 20 1.3 773.4 1.9X +SQL Parquet Vectorized 237 265 69 66.3 15.1 96.9X +SQL Parquet MR 2199 2199 0 7.2 139.8 10.5X +SQL ORC Vectorized 303 311 10 51.9 19.3 76.0X +SQL ORC MR 1750 1763 18 9.0 111.3 13.1X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ParquetReader Vectorized 310 351 74 50.8 19.7 1.0X -ParquetReader Vectorized -> Row 281 297 8 55.9 17.9 1.1X +ParquetReader Vectorized 331 368 80 47.6 21.0 1.0X +ParquetReader Vectorized -> Row 314 318 6 50.0 20.0 1.1X -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 19745 19811 93 0.8 1255.4 1.0X -SQL Json 12523 12760 335 1.3 796.2 1.6X -SQL Parquet Vectorized 153 160 6 102.9 9.7 129.2X -SQL Parquet MR 2325 2338 18 6.8 147.8 8.5X -SQL ORC Vectorized 389 401 8 40.5 24.7 50.8X -SQL ORC MR 2009 2009 1 7.8 127.7 9.8X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 17442 18560 1581 0.9 1108.9 1.0X +SQL Json 10833 11056 315 1.5 688.8 1.6X +SQL Parquet Vectorized 150 162 10 105.0 9.5 116.5X +SQL Parquet MR 1804 1922 167 8.7 114.7 9.7X +SQL ORC Vectorized 317 336 20 49.6 20.2 55.0X +SQL ORC MR 1550 1648 139 10.1 98.5 11.3X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ParquetReader Vectorized 240 244 4 65.5 15.3 1.0X -ParquetReader Vectorized -> Row 223 230 6 70.5 14.2 1.1X +ParquetReader Vectorized 240 263 11 65.7 15.2 1.0X +ParquetReader Vectorized -> Row 224 235 15 70.4 14.2 1.1X -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 27223 27293 99 0.6 1730.8 1.0X -SQL Json 18601 18646 63 0.8 1182.6 1.5X -SQL Parquet Vectorized 247 251 3 63.8 15.7 110.4X -SQL Parquet MR 2724 2773 69 5.8 173.2 10.0X -SQL ORC Vectorized 474 484 10 33.2 30.1 57.4X -SQL ORC MR 2342 2368 37 6.7 148.9 11.6X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 22438 23472 1462 0.7 1426.5 1.0X +SQL Json 15839 15888 70 1.0 1007.0 1.4X +SQL Parquet Vectorized 215 229 12 73.3 13.6 104.6X +SQL Parquet MR 1928 2061 188 8.2 122.6 11.6X +SQL ORC Vectorized 393 421 17 40.0 25.0 57.0X +SQL ORC MR 1799 1814 22 8.7 114.4 12.5X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parquet Reader Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -ParquetReader Vectorized 326 335 13 48.3 20.7 1.0X -ParquetReader Vectorized -> Row 358 365 7 44.0 22.7 0.9X +ParquetReader Vectorized 310 316 9 50.7 19.7 1.0X +ParquetReader Vectorized -> Row 289 302 20 54.3 18.4 1.1X ================================================================================================ Int and String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 18706 18716 15 0.6 1783.9 1.0X -SQL Json 12665 12762 138 0.8 1207.8 1.5X -SQL Parquet Vectorized 2408 2419 15 4.4 229.6 7.8X -SQL Parquet MR 4599 4620 30 2.3 438.6 4.1X -SQL ORC Vectorized 2397 2400 3 4.4 228.6 7.8X -SQL ORC MR 4267 4288 30 2.5 406.9 4.4X +SQL CSV 15669 15869 283 0.7 1494.3 1.0X +SQL Json 10126 10559 613 1.0 965.7 1.5X +SQL Parquet Vectorized 2056 2064 11 5.1 196.0 7.6X +SQL Parquet MR 3918 3927 13 2.7 373.6 4.0X +SQL ORC Vectorized 1786 1887 143 5.9 170.3 8.8X +SQL ORC MR 3521 3555 48 3.0 335.8 4.4X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10822 10838 23 1.0 1032.0 1.0X -SQL Json 7459 7488 41 1.4 711.4 1.5X -SQL Parquet Vectorized 875 895 26 12.0 83.5 12.4X -SQL Parquet MR 1976 2002 37 5.3 188.4 5.5X -SQL ORC Vectorized 533 539 8 19.7 50.9 20.3X -SQL ORC MR 2191 2194 5 4.8 208.9 4.9X +SQL CSV 8659 8948 409 1.2 825.8 1.0X +SQL Json 6410 6536 177 1.6 611.3 1.4X +SQL Parquet Vectorized 655 709 47 16.0 62.4 13.2X +SQL Parquet MR 1528 1531 3 6.9 145.7 5.7X +SQL ORC Vectorized 388 416 24 27.0 37.0 22.3X +SQL ORC MR 1599 1700 142 6.6 152.5 5.4X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Data column - CSV 31196 31449 359 0.5 1983.4 1.0X -Data column - Json 16118 16855 1041 1.0 1024.8 1.9X -Data column - Parquet Vectorized 243 251 9 64.8 15.4 128.4X -Data column - Parquet MR 4213 4288 106 3.7 267.8 7.4X -Data column - ORC Vectorized 335 341 4 46.9 21.3 93.1X -Data column - ORC MR 3119 3146 38 5.0 198.3 10.0X -Partition column - CSV 9616 9915 423 1.6 611.3 3.2X -Partition column - Json 14136 14164 39 1.1 898.8 2.2X -Partition column - Parquet Vectorized 64 70 6 243.9 4.1 483.8X -Partition column - Parquet MR 1954 1980 38 8.1 124.2 16.0X -Partition column - ORC Vectorized 67 74 8 233.4 4.3 462.9X -Partition column - ORC MR 2461 2479 26 6.4 156.4 12.7X -Both columns - CSV 30327 30666 479 0.5 1928.2 1.0X -Both columns - Json 18656 18789 188 0.8 1186.1 1.7X -Both columns - Parquet Vectorized 291 297 7 54.0 18.5 107.2X -Both columns - Parquet MR 4430 4443 19 3.6 281.6 7.0X -Both columns - ORC Vectorized 403 411 11 39.0 25.6 77.4X -Both columns - ORC MR 3580 3584 5 4.4 227.6 8.7X +Data column - CSV 21094 21357 372 0.7 1341.1 1.0X +Data column - Json 11163 11434 383 1.4 709.7 1.9X +Data column - Parquet Vectorized 225 238 13 69.9 14.3 93.7X +Data column - Parquet MR 2218 2342 175 7.1 141.0 9.5X +Data column - ORC Vectorized 276 300 20 56.9 17.6 76.4X +Data column - ORC MR 1851 1863 17 8.5 117.7 11.4X +Partition column - CSV 5834 6119 403 2.7 370.9 3.6X +Partition column - Json 9746 9754 11 1.6 619.6 2.2X +Partition column - Parquet Vectorized 57 61 2 273.9 3.7 367.4X +Partition column - Parquet MR 1164 1167 5 13.5 74.0 18.1X +Partition column - ORC Vectorized 60 64 3 261.3 3.8 350.4X +Partition column - ORC MR 1298 1304 8 12.1 82.5 16.2X +Both columns - CSV 22632 22636 4 0.7 1438.9 0.9X +Both columns - Json 12568 12587 26 1.3 799.1 1.7X +Both columns - Parquet Vectorized 283 288 7 55.5 18.0 74.4X +Both columns - Parquet MR 2547 2553 8 6.2 161.9 8.3X +Both columns - ORC Vectorized 343 346 4 45.8 21.8 61.5X +Both columns - ORC MR 2177 2178 2 7.2 138.4 9.7X ================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 15606 15614 11 0.7 1488.3 1.0X -SQL Json 15406 15451 63 0.7 1469.3 1.0X -SQL Parquet Vectorized 1555 1573 25 6.7 148.3 10.0X -SQL Parquet MR 5369 5377 11 2.0 512.0 2.9X -ParquetReader Vectorized 1145 1150 7 9.2 109.2 13.6X -SQL ORC Vectorized 1023 1027 6 10.2 97.6 15.3X -SQL ORC MR 4421 4542 172 2.4 421.6 3.5X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 11364 11364 0 0.9 1083.7 1.0X +SQL Json 10555 10562 9 1.0 1006.6 1.1X +SQL Parquet Vectorized 1299 1309 13 8.1 123.9 8.7X +SQL Parquet MR 3350 3351 1 3.1 319.5 3.4X +ParquetReader Vectorized 983 987 5 10.7 93.8 11.6X +SQL ORC Vectorized 912 913 1 11.5 87.0 12.5X +SQL ORC MR 3056 3059 5 3.4 291.4 3.7X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 11096 11159 90 0.9 1058.2 1.0X -SQL Json 10797 11304 717 1.0 1029.7 1.0X -SQL Parquet Vectorized 1218 1230 16 8.6 116.2 9.1X -SQL Parquet MR 3778 3806 40 2.8 360.3 2.9X -ParquetReader Vectorized 1108 1118 14 9.5 105.7 10.0X -SQL ORC Vectorized 1361 1371 13 7.7 129.8 8.2X -SQL ORC MR 4186 4196 14 2.5 399.2 2.7X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 8651 8654 5 1.2 825.0 1.0X +SQL Json 7791 7794 4 1.3 743.0 1.1X +SQL Parquet Vectorized 1045 1055 15 10.0 99.7 8.3X +SQL Parquet MR 2516 2519 3 4.2 240.0 3.4X +ParquetReader Vectorized 927 933 6 11.3 88.4 9.3X +SQL ORC Vectorized 1285 1286 2 8.2 122.5 6.7X +SQL ORC MR 3013 3013 0 3.5 287.4 2.9X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 8803 8866 90 1.2 839.5 1.0X -SQL Json 7220 7249 42 1.5 688.5 1.2X -SQL Parquet Vectorized 258 265 7 40.6 24.6 34.1X -SQL Parquet MR 2760 2761 0 3.8 263.2 3.2X -ParquetReader Vectorized 277 283 5 37.8 26.4 31.7X -SQL ORC Vectorized 514 522 6 20.4 49.1 17.1X -SQL ORC MR 2523 2591 96 4.2 240.6 3.5X +SQL CSV 6272 6288 23 1.7 598.1 1.0X +SQL Json 4469 4469 0 2.3 426.2 1.4X +SQL Parquet Vectorized 231 235 7 45.4 22.0 27.2X +SQL Parquet MR 1673 1674 2 6.3 159.5 3.7X +ParquetReader Vectorized 243 244 3 43.1 23.2 25.8X +SQL ORC Vectorized 471 472 2 22.2 45.0 13.3X +SQL ORC MR 1606 1618 17 6.5 153.2 3.9X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 3022 3032 14 0.3 2881.9 1.0X -SQL Json 4047 4051 5 0.3 3859.5 0.7X -SQL Parquet Vectorized 50 54 6 20.8 48.1 59.9X -SQL Parquet MR 299 301 2 3.5 285.0 10.1X -SQL ORC Vectorized 59 63 11 17.9 55.9 51.6X -SQL ORC MR 255 259 5 4.1 243.4 11.8X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 2171 2173 2 0.5 2070.8 1.0X +SQL Json 2266 2278 17 0.5 2161.3 1.0X +SQL Parquet Vectorized 51 55 7 20.4 49.0 42.2X +SQL Parquet MR 190 192 2 5.5 180.9 11.4X +SQL ORC Vectorized 57 61 8 18.4 54.2 38.2X +SQL ORC MR 161 164 2 6.5 153.8 13.5X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 50 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 7250 7252 3 0.1 6914.4 1.0X -SQL Json 15641 15718 109 0.1 14916.8 0.5X -SQL Parquet Vectorized 66 72 8 15.9 62.9 110.0X -SQL Parquet MR 320 323 3 3.3 305.0 22.7X -SQL ORC Vectorized 72 77 11 14.6 68.6 100.9X -SQL ORC MR 269 273 5 3.9 256.8 26.9X - -OpenJDK 64-Bit Server VM 1.8.0_282-b08 on Linux 5.4.0-1043-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +SQL CSV 5200 5211 15 0.2 4959.5 1.0X +SQL Json 8312 8318 8 0.1 7927.1 0.6X +SQL Parquet Vectorized 67 73 10 15.7 63.9 77.6X +SQL Parquet MR 210 214 4 5.0 200.4 24.8X +SQL ORC Vectorized 70 77 16 15.0 66.7 74.3X +SQL ORC MR 182 184 2 5.8 173.6 28.6X + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1020-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SQL CSV 10962 11340 535 0.1 10454.1 1.0X -SQL Json 24951 25755 1137 0.0 23795.0 0.4X -SQL Parquet Vectorized 84 93 6 12.4 80.5 129.9X -SQL Parquet MR 280 296 14 3.7 266.8 39.2X -SQL ORC Vectorized 70 76 6 15.0 66.6 156.9X -SQL ORC MR 231 242 13 4.5 220.1 47.5X +SQL CSV 9030 9032 2 0.1 8611.8 1.0X +SQL Json 15429 15462 46 0.1 14714.5 0.6X +SQL Parquet Vectorized 91 97 8 11.5 87.2 98.8X +SQL Parquet MR 235 239 3 4.5 224.2 38.4X +SQL ORC Vectorized 80 84 9 13.1 76.4 112.8X +SQL ORC MR 192 201 7 5.5 183.4 47.0X diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java index 39591be3b4be4..0eb5d65a4a8f7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java @@ -53,19 +53,52 @@ public void skip() { throw new UnsupportedOperationException(); } + private void updateCurrentByte() { + try { + currentByte = (byte) in.read(); + } catch (IOException e) { + throw new ParquetDecodingException("Failed to read a byte", e); + } + } + @Override public final void readBooleans(int total, WritableColumnVector c, int rowId) { - // TODO: properly vectorize this - for (int i = 0; i < total; i++) { - c.putBoolean(rowId + i, readBoolean()); + int i = 0; + if (bitOffset > 0) { + i = Math.min(8 - bitOffset, total); + c.putBooleans(rowId, i, currentByte, bitOffset); + bitOffset = (bitOffset + i) & 7; + } + for (; i + 7 < total; i += 8) { + updateCurrentByte(); + c.putBooleans(rowId + i, currentByte); + } + if (i < total) { + updateCurrentByte(); + bitOffset = total - i; + c.putBooleans(rowId + i, bitOffset, currentByte, 0); } } @Override public final void skipBooleans(int total) { - // TODO: properly vectorize this - for (int i = 0; i < total; i++) { - readBoolean(); + int i = 0; + if (bitOffset > 0) { + i = Math.min(8 - bitOffset, total); + bitOffset = (bitOffset + i) & 7; + } + if (i + 7 < total) { + int numBytesToSkip = (total - i) / 8; + try { + in.skipFully(numBytesToSkip); + } catch (IOException e) { + throw new ParquetDecodingException("Failed to skip bytes", e); + } + i += numBytesToSkip * 8; + } + if (i < total) { + updateCurrentByte(); + bitOffset = total - i; } } @@ -276,13 +309,8 @@ public void skipShorts(int total) { @Override public final boolean readBoolean() { - // TODO: vectorize decoding and keep boolean[] instead of currentByte if (bitOffset == 0) { - try { - currentByte = (byte) in.read(); - } catch (IOException e) { - throw new ParquetDecodingException("Failed to read a byte", e); - } + updateCurrentByte(); } boolean v = (currentByte & (1 << bitOffset)) != 0; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index f7c9dc55f7eca..bbe96819a618b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -152,6 +152,18 @@ public void putBooleans(int rowId, int count, boolean value) { } } + @Override + public void putBooleans(int rowId, byte src) { + Platform.putByte(null, data + rowId, (byte)(src & 1)); + Platform.putByte(null, data + rowId + 1, (byte)(src >>> 1 & 1)); + Platform.putByte(null, data + rowId + 2, (byte)(src >>> 2 & 1)); + Platform.putByte(null, data + rowId + 3, (byte)(src >>> 3 & 1)); + Platform.putByte(null, data + rowId + 4, (byte)(src >>> 4 & 1)); + Platform.putByte(null, data + rowId + 5, (byte)(src >>> 5 & 1)); + Platform.putByte(null, data + rowId + 6, (byte)(src >>> 6 & 1)); + Platform.putByte(null, data + rowId + 7, (byte)(src >>> 7 & 1)); + } + @Override public boolean getBoolean(int rowId) { return Platform.getByte(null, data + rowId) == 1; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 3fb96d872cd8b..833a93f2a2bdb 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -147,6 +147,18 @@ public void putBooleans(int rowId, int count, boolean value) { } } + @Override + public void putBooleans(int rowId, byte src) { + byteData[rowId] = (byte)(src & 1); + byteData[rowId + 1] = (byte)(src >>> 1 & 1); + byteData[rowId + 2] = (byte)(src >>> 2 & 1); + byteData[rowId + 3] = (byte)(src >>> 3 & 1); + byteData[rowId + 4] = (byte)(src >>> 4 & 1); + byteData[rowId + 5] = (byte)(src >>> 5 & 1); + byteData[rowId + 6] = (byte)(src >>> 6 & 1); + byteData[rowId + 7] = (byte)(src >>> 7 & 1); + } + @Override public boolean getBoolean(int rowId) { return byteData[rowId] == 1; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 8f7dcf237440a..5e01c372793f1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -46,6 +46,7 @@ * WritableColumnVector are intended to be reused. */ public abstract class WritableColumnVector extends ColumnVector { + private final byte[] byte8 = new byte[8]; /** * Resets this column for writing. The currently stored values are no longer accessible. @@ -201,6 +202,29 @@ public WritableColumnVector reserveDictionaryIds(int capacity) { */ public abstract void putBooleans(int rowId, int count, boolean value); + /** + * Sets bits from [src[srcIndex], src[srcIndex + count]) to [rowId, rowId + count) + * src must contain bit-packed 8 booleans in the byte. + */ + public void putBooleans(int rowId, int count, byte src, int srcIndex) { + assert ((srcIndex + count) <= 8); + byte8[0] = (byte)(src & 1); + byte8[1] = (byte)(src >>> 1 & 1); + byte8[2] = (byte)(src >>> 2 & 1); + byte8[3] = (byte)(src >>> 3 & 1); + byte8[4] = (byte)(src >>> 4 & 1); + byte8[5] = (byte)(src >>> 5 & 1); + byte8[6] = (byte)(src >>> 6 & 1); + byte8[7] = (byte)(src >>> 7 & 1); + putBytes(rowId, count, byte8, srcIndex); + } + + /** + * Sets bits from [src[0], src[7]] to [rowId, rowId + 7] + * src must contain bit-packed 8 booleans in the byte. + */ + public abstract void putBooleans(int rowId, byte src); + /** * Sets `value` to the value at rowId. */ @@ -470,6 +494,18 @@ public final int appendBooleans(int count, boolean v) { return result; } + /** + * Append bits from [src[offset], src[offset + count]) + * src must contain bit-packed 8 booleans in the byte. + */ + public final int appendBooleans(int count, byte src, int offset) { + reserve(elementsAppended + count); + int result = elementsAppended; + putBooleans(elementsAppended, count, src, offset); + elementsAppended += count; + return result; + } + public final int appendByte(byte v) { reserve(elementsAppended + 1); putByte(elementsAppended, v); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index 0fc43c7052d06..0e9e9a7060276 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -119,31 +119,36 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) + val query = dataType match { + case BooleanType => "sum(cast(id as bigint))" + case _ => "sum(id)" + } + sqlBenchmark.addCase("SQL CSV") { _ => - spark.sql("select sum(id) from csvTable").noop() + spark.sql(s"select $query from csvTable").noop() } sqlBenchmark.addCase("SQL Json") { _ => - spark.sql("select sum(id) from jsonTable").noop() + spark.sql(s"select $query from jsonTable").noop() } sqlBenchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql("select sum(id) from parquetTable").noop() + spark.sql(s"select $query from parquetTable").noop() } sqlBenchmark.addCase("SQL Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(id) from parquetTable").noop() + spark.sql(s"select $query from parquetTable").noop() } } sqlBenchmark.addCase("SQL ORC Vectorized") { _ => - spark.sql("SELECT sum(id) FROM orcTable").noop() + spark.sql(s"SELECT $query FROM orcTable").noop() } sqlBenchmark.addCase("SQL ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(id) FROM orcTable").noop() + spark.sql(s"SELECT $query FROM orcTable").noop() } } @@ -157,6 +162,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { var longSum = 0L var doubleSum = 0.0 val aggregateValue: (ColumnVector, Int) => Unit = dataType match { + case BooleanType => (col: ColumnVector, i: Int) => if (col.getBoolean(i)) longSum += 1L case ByteType => (col: ColumnVector, i: Int) => longSum += col.getByte(i) case ShortType => (col: ColumnVector, i: Int) => longSum += col.getShort(i) case IntegerType => (col: ColumnVector, i: Int) => longSum += col.getInt(i) @@ -191,6 +197,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { var longSum = 0L var doubleSum = 0.0 val aggregateValue: (InternalRow) => Unit = dataType match { + case BooleanType => (col: InternalRow) => if (col.getBoolean(0)) longSum += 1L case ByteType => (col: InternalRow) => longSum += col.getByte(0) case ShortType => (col: InternalRow) => longSum += col.getShort(0) case IntegerType => (col: InternalRow) => longSum += col.getInt(0) @@ -542,7 +549,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { runBenchmark("SQL Single Numeric Column Scan") { - Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { + Seq(BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => numericScanBenchmark(1024 * 1024 * 15, dataType) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index 2317a4d00e069..79b8c9e2c571e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -29,14 +29,15 @@ import org.apache.spark.sql.test.SharedSparkSession class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSession { import testImplicits._ - val ROW = ((1).toByte, 2, 3L, "abc", Period.of(1, 1, 0), Duration.ofMillis(100)) + val ROW = ((1).toByte, 2, 3L, "abc", Period.of(1, 1, 0), Duration.ofMillis(100), true) val NULL_ROW = ( null.asInstanceOf[java.lang.Byte], null.asInstanceOf[Integer], null.asInstanceOf[java.lang.Long], null.asInstanceOf[String], null.asInstanceOf[Period], - null.asInstanceOf[Duration]) + null.asInstanceOf[Duration], + null.asInstanceOf[java.lang.Boolean]) test("All Types Dictionary") { (1 :: 1000 :: Nil).foreach { n => { @@ -59,6 +60,7 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess assert(batch.column(3).getUTF8String(i).toString == "abc") assert(batch.column(4).getInt(i) == 13) assert(batch.column(5).getLong(i) == 100000) + assert(batch.column(6).getBoolean(i) == true) i += 1 } reader.close() @@ -88,6 +90,7 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess assert(batch.column(3).isNullAt(i)) assert(batch.column(4).isNullAt(i)) assert(batch.column(5).isNullAt(i)) + assert(batch.column(6).isNullAt(i)) i += 1 } reader.close() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 0477b41942d4b..738f2281c9a65 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -25,9 +25,11 @@ import java.util.NoSuchElementException import scala.collection.JavaConverters._ import scala.collection.mutable +import scala.language.implicitConversions import scala.util.Random import org.apache.arrow.vector.IntVector +import org.apache.parquet.bytes.ByteBufferInputStream import org.apache.spark.SparkFunSuite import org.apache.spark.memory.MemoryMode @@ -36,6 +38,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapBuilder, DateTimeUtils, GenericArrayData, MapData} import org.apache.spark.sql.execution.RowToColumnConverter +import org.apache.spark.sql.execution.datasources.parquet.VectorizedPlainValuesReader import org.apache.spark.sql.types._ import org.apache.spark.sql.util.ArrowUtils import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnarBatchRow, ColumnVector} @@ -130,6 +133,97 @@ class ColumnarBatchSuite extends SparkFunSuite { } } + testVector("Boolean APIs", 1024, BooleanType) { + column => + val reference = mutable.ArrayBuffer.empty[Boolean] + + var values = Array(true, false, true, false, false) + var bits = values.foldRight(0)((b, i) => i << 1 | (if (b) 1 else 0)).toByte + column.appendBooleans(2, bits, 0) + reference ++= values.slice(0, 2) + + column.appendBooleans(3, bits, 2) + reference ++= values.slice(2, 5) + + column.appendBooleans(6, true) + reference ++= Array.fill(6)(true) + + column.appendBoolean(false) + reference += false + + var idx = column.elementsAppended + + values = Array(true, true, false, true, false, true, false, true) + bits = values.foldRight(0)((b, i) => i << 1 | (if (b) 1 else 0)).toByte + column.putBooleans(idx, 2, bits, 0) + reference ++= values.slice(0, 2) + idx += 2 + + column.putBooleans(idx, 3, bits, 2) + reference ++= values.slice(2, 5) + idx += 3 + + column.putBooleans(idx, bits) + reference ++= values + idx += 8 + + column.putBoolean(idx, false) + reference += false + idx += 1 + + column.putBooleans(idx, 3, true) + reference ++= Array.fill(3)(true) + idx += 3 + + implicit def intToByte(i: Int): Byte = i.toByte + val buf = ByteBuffer.wrap(Array(0x33, 0x5A, 0xA5, 0xCC, 0x0F, 0xF0, 0xEE, 0x77, 0x88)) + val reader = new VectorizedPlainValuesReader() + reader.initFromPage(0, ByteBufferInputStream.wrap(buf)) + + reader.skipBooleans(1) // bit index 0 + + column.putBoolean(idx, reader.readBoolean) // bit index 1 + reference += true + idx += 1 + + column.putBoolean(idx, reader.readBoolean) // bit index 2 + reference += false + idx += 1 + + reader.skipBooleans(5) // bit index [3, 7] + + column.putBoolean(idx, reader.readBoolean) // bit index 8 + reference += false + idx += 1 + + reader.skipBooleans(8) // bit index [9, 16] + reader.skipBooleans(0) // no-op + + column.putBoolean(idx, reader.readBoolean) // bit index 17 + reference += false + idx += 1 + + reader.skipBooleans(16) // bit index [18, 33] + + reader.readBooleans(4, column, idx) // bit index [34, 37] + reference ++= Array(true, true, false, false) + idx += 4 + + reader.readBooleans(11, column, idx) // bit index [38, 48] + reference ++= Array(false, false, false, false, false, false, true, true, true, true, false) + idx += 11 + + reader.skipBooleans(7) // bit index [49, 55] + + reader.readBooleans(9, column, idx) // bit index [56, 64] + reference ++= Array(true, true, true, false, true, true, true, false, false) + idx += 9 + + reference.zipWithIndex.foreach { v => + assert(v._1 == column.getBoolean(v._2), "VectorType=" + column.getClass.getSimpleName) + } + } + testVector("Byte APIs", 1024, ByteType) { column => val reference = mutable.ArrayBuffer.empty[Byte]