From 13ab6ba969e2740953fe3797c79b02f71342de9e Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Wed, 6 May 2026 01:55:31 +0000 Subject: [PATCH 1/2] feat(btree): refactor GlobalIndexScan & add inte test for btree global index --- .../paimon/global_index/global_index_reader.h | 10 +- .../paimon/global_index/global_index_result.h | 10 +- .../paimon/global_index/global_index_scan.h | 75 +- .../row_range_global_index_scanner.h | 60 - include/paimon/scan_context.h | 11 +- src/paimon/CMakeLists.txt | 5 +- src/paimon/common/global_index/CMakeLists.txt | 4 +- .../btree/btree_file_meta_selector_test.cpp | 2 +- .../btree/btree_global_index_writer.cpp | 5 +- .../btree/btree_global_indexer.cpp | 10 +- .../btree/lazy_filtered_btree_reader.cpp | 59 +- .../btree/lazy_filtered_btree_reader.h | 2 + .../offset_global_index_reader.cpp | 151 ++ .../global_index/offset_global_index_reader.h | 77 + .../offset_global_index_reader_test.cpp | 311 ++++ .../range_bitmap_global_index_test.cpp | 1 + .../union_global_index_reader.cpp | 207 +++ .../global_index/union_global_index_reader.h | 85 + .../union_global_index_reader_test.cpp | 460 ++++++ src/paimon/common/sst/sst_file_reader.cpp | 2 +- src/paimon/common/utils/row_range_index.cpp | 2 +- .../append/append_compact_coordinator.cpp | 3 +- .../global_index/global_index_evaluator.h | 19 +- .../global_index_evaluator_impl.cpp | 78 +- .../global_index_evaluator_impl.h | 28 +- .../core/global_index/global_index_scan.cpp | 10 +- .../global_index/global_index_scan_impl.cpp | 290 ++-- .../global_index/global_index_scan_impl.h | 62 +- .../global_index/global_index_write_task.cpp | 7 +- .../row_range_global_index_scanner_impl.cpp | 138 -- .../row_range_global_index_scanner_impl.h | 76 - .../operation/abstract_file_store_write.cpp | 3 +- .../data_evolution_file_store_scan.cpp | 52 +- .../data_evolution_file_store_scan.h | 7 +- .../data_evolution_file_store_scan_test.cpp | 68 +- .../core/operation/file_store_commit_impl.cpp | 10 +- 
src/paimon/core/operation/file_store_scan.cpp | 30 +- src/paimon/core/operation/file_store_scan.h | 14 +- .../key_value_file_store_scan_test.cpp | 42 +- src/paimon/core/operation/scan_context.cpp | 10 +- .../core/operation/scan_context_test.cpp | 11 - .../source/data_evolution_batch_scan.cpp | 67 +- .../table/source/data_evolution_batch_scan.h | 4 +- .../core/table/source/data_table_batch_scan.h | 4 +- .../table/source/snapshot/snapshot_reader.h | 4 +- src/paimon/core/table/source/table_scan.cpp | 3 +- test/inte/global_index_test.cpp | 1465 +++++++++++------ .../append_with_btree_with_partition/README | 27 + ...0323e05c-8903-49a2-9bc0-0a7e5a395211.index | Bin 0 -> 229 bytes ...061cad82-2fc6-448e-a3ce-7ac49c836273.index | Bin 0 -> 176 bytes ...1c6c0c2a-b8f0-410e-90bf-26aadf8472af.index | Bin 0 -> 229 bytes ...1cf3190c-b80f-46b9-bce3-0566612a2e3c.index | Bin 0 -> 180 bytes ...2525fcef-a890-403a-a587-c9d63329adc6.index | Bin 0 -> 229 bytes ...2d75bcb5-1f7d-40b6-9e13-ed11ac16cbb2.index | Bin 0 -> 176 bytes ...3311dd84-93f0-4b16-b3eb-54864d44767b.index | Bin 0 -> 229 bytes ...476c39a6-b760-45db-9602-a1e6559cfabe.index | Bin 0 -> 180 bytes ...4c089e9d-f607-41dc-8edc-41e10a951eac.index | Bin 0 -> 89 bytes ...4ed7238f-dd41-4ab0-ad3a-816a8f26aba3.index | Bin 0 -> 229 bytes ...6eb46ccf-94b1-4ba2-8704-9ce827de9c5b.index | Bin 0 -> 189 bytes ...77eb6c57-c860-47d1-83a9-ec93886e76c9.index | Bin 0 -> 176 bytes ...7a7e9396-772a-40c2-811c-ddc1f716e1d9.index | Bin 0 -> 189 bytes ...84feda98-48d8-40e1-84a1-5c5b737ef5e5.index | Bin 0 -> 144 bytes ...8a91544c-8779-40bf-a66c-4f84f576a6ec.index | Bin 0 -> 229 bytes ...8b658fde-dc52-4948-8733-7a6b9e2aa58e.index | Bin 0 -> 180 bytes ...8b964514-2199-42d4-b2c2-625b461f82f3.index | Bin 0 -> 242 bytes ...8dbf5e09-7023-4f97-8bbb-248d3eec6bed.index | Bin 0 -> 105 bytes ...97cac122-2d88-4b84-88aa-efe022506163.index | Bin 0 -> 144 bytes ...9f0d6ea2-e3d9-4877-9c57-85c285a2cd9d.index | Bin 0 -> 144 bytes 
...ae2b3e23-a92a-4688-b6f9-b35117567156.index | Bin 0 -> 105 bytes ...af76d02b-7f3a-4f6b-b45b-db44c5ff24cc.index | Bin 0 -> 180 bytes ...bd1a00c8-ff45-426d-af34-f7080ed247ba.index | Bin 0 -> 89 bytes ...befa46d1-beb2-4a58-912b-2318efd2105c.index | Bin 0 -> 180 bytes ...c146d9cb-5e43-4bf8-891b-a05f92cd0563.index | Bin 0 -> 229 bytes ...d8503faa-e1d3-474e-a1f1-2e380dec91d3.index | Bin 0 -> 180 bytes ...d99a76c1-c181-49d8-8882-812700de4013.index | Bin 0 -> 242 bytes ...da0c9c05-4f11-4e9a-8246-01582ee5e614.index | Bin 0 -> 144 bytes ...e81874df-120e-4124-9517-b8b7fbefc336.index | Bin 0 -> 180 bytes ...f35fe6ce-b072-4fd6-a91c-a5d940ae5998.index | Bin 0 -> 229 bytes ...f530db7c-32f7-4f35-b5d2-4b7f321488c4.index | Bin 0 -> 180 bytes ...f67895be-6638-449c-9a1b-f2ac7d0efc7d.index | Bin 0 -> 176 bytes ...est-efa3dce8-fe86-499a-b436-182337edd173-0 | Bin 0 -> 2407 bytes ...est-804b07d3-0855-4246-b778-8bb12a5154dc-0 | Bin 0 -> 2162 bytes ...est-d67632fc-11f7-484d-8634-5d4ae78743e5-0 | Bin 0 -> 2153 bytes ...ist-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0 | Bin 0 -> 1006 bytes ...ist-531b8b9e-f8b9-49a1-b2b1-7c131df48686-1 | Bin 0 -> 1113 bytes ...ist-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0 | Bin 0 -> 1151 bytes ...ist-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1 | Bin 0 -> 1006 bytes ...ist-bde5901d-912f-40ca-a9c5-fd9d679b4924-0 | Bin 0 -> 1113 bytes ...ist-bde5901d-912f-40ca-a9c5-fd9d679b4924-1 | Bin 0 -> 1119 bytes ...5888-3dd7-4cc9-95f9-9a6cc88e88cf-0.parquet | Bin 0 -> 3411 bytes ...1641-4057-4e0d-a2d4-91a7842d2acb-0.parquet | Bin 0 -> 3412 bytes .../schema/schema-0 | 50 + .../snapshot/EARLIEST | 1 + .../snapshot/LATEST | 1 + .../snapshot/snapshot-1 | 16 + .../snapshot/snapshot-2 | 16 + .../snapshot/snapshot-3 | 17 + .../append_with_btree_with_partition/README | 27 + ...03d348f1-3a2b-47b2-ab26-1d894a5bc150.index | Bin 0 -> 180 bytes ...03da8b2e-7eb5-42c7-9741-6cfa57e3e305.index | Bin 0 -> 176 bytes ...0b4db5e0-03a9-4fa4-ba96-0288a0eb0b19.index | Bin 0 -> 180 bytes 
...10d1c370-e5ad-4530-9935-84e018b7cf8f.index | Bin 0 -> 229 bytes ...226a2318-32a7-4569-88b1-096c4029e4de.index | Bin 0 -> 105 bytes ...336a671f-39fd-4922-b322-ec854712e573.index | Bin 0 -> 180 bytes ...34391e56-cb42-4e31-9002-8be9162b9d77.index | Bin 0 -> 144 bytes ...36d11008-5e75-4801-82da-f5e7c7bc62e0.index | Bin 0 -> 105 bytes ...5279021b-ac4b-4418-9e56-5e8b3ede837b.index | Bin 0 -> 89 bytes ...5d9fd6d0-68e9-46c6-b064-04434bc36ad0.index | Bin 0 -> 180 bytes ...66e10354-4845-43e2-9838-3ceb058fe2c7.index | Bin 0 -> 176 bytes ...6e579175-580a-4b14-ad1c-c3972407503c.index | Bin 0 -> 180 bytes ...6fc58c7a-d306-4da1-a33e-22dfee9390e9.index | Bin 0 -> 229 bytes ...77874487-0b24-461b-93ef-46f04280466a.index | Bin 0 -> 144 bytes ...7a81ec36-7dad-4a49-a5c8-62fdc3eb9064.index | Bin 0 -> 189 bytes ...86da803a-8554-4201-934c-0a8edf06bb94.index | Bin 0 -> 229 bytes ...8be14989-0b16-44d6-aedd-0556af02c4b5.index | Bin 0 -> 180 bytes ...93f35d27-0323-42d2-9a03-651f4d75b4a6.index | Bin 0 -> 144 bytes ...ac97eb16-ec2e-4a32-8366-939956cc32df.index | Bin 0 -> 229 bytes ...c02c2b4e-e9b1-437e-82b2-9354c4f2ecfb.index | Bin 0 -> 189 bytes ...c54baf11-8caf-47b4-9f04-e6e70aa64211.index | Bin 0 -> 229 bytes ...c8749a02-24af-4188-8e28-a080b56e79f0.index | Bin 0 -> 180 bytes ...cc3a2bf4-5a19-46d1-8634-e7e1256026e8.index | Bin 0 -> 180 bytes ...d02fec1b-f5eb-4890-a7eb-740cbc385f84.index | Bin 0 -> 242 bytes ...d535d249-542f-4bb1-9a69-dc957176497b.index | Bin 0 -> 89 bytes ...dc4937c8-cdc6-4447-8cc2-40ebebf338d7.index | Bin 0 -> 242 bytes ...dd37e0dd-c106-48bd-8ba0-4315b38b2eed.index | Bin 0 -> 176 bytes ...e2c03e02-aa1c-46d0-b779-4c9d203c7d73.index | Bin 0 -> 229 bytes ...f91d5405-4116-4a89-acc6-4e4c028410e5.index | Bin 0 -> 176 bytes ...fcfd9ca5-fa06-422e-89a6-f33ea60630eb.index | Bin 0 -> 229 bytes ...fead2db1-5481-4792-b20c-d978dfc00de5.index | Bin 0 -> 144 bytes ...ff65668f-e4d2-4e69-ab5a-79a777f51650.index | Bin 0 -> 229 bytes ...est-7b331606-88e1-4bdd-8426-8c6b88103b42-0 | Bin 0 -> 
2405 bytes ...est-b2c71b39-9e84-4784-8441-a367e6aebaeb-0 | Bin 0 -> 2163 bytes ...est-ce023896-9468-49d9-8c55-e26271ff81da-0 | Bin 0 -> 2158 bytes ...ist-28b8fffb-abfc-4f8c-b366-99f534b294eb-0 | Bin 0 -> 1113 bytes ...ist-28b8fffb-abfc-4f8c-b366-99f534b294eb-1 | Bin 0 -> 1119 bytes ...ist-349b7730-d60f-4562-b106-5ef4f6a99a41-0 | Bin 0 -> 1006 bytes ...ist-349b7730-d60f-4562-b106-5ef4f6a99a41-1 | Bin 0 -> 1113 bytes ...ist-8b3083fa-2817-477f-beb5-f59f058015b1-0 | Bin 0 -> 1153 bytes ...ist-8b3083fa-2817-477f-beb5-f59f058015b1-1 | Bin 0 -> 1006 bytes ...1b0d-4fc5-410a-9157-2dd1cf34a15e-0.parquet | Bin 0 -> 3411 bytes ...cbef-417f-47b0-83ee-4145bc3dedb7-0.parquet | Bin 0 -> 3412 bytes .../schema/schema-0 | 50 + .../snapshot/EARLIEST | 1 + .../snapshot/LATEST | 1 + .../snapshot/snapshot-1 | 16 + .../snapshot/snapshot-2 | 16 + .../snapshot/snapshot-3 | 17 + 147 files changed, 2916 insertions(+), 1394 deletions(-) delete mode 100644 include/paimon/global_index/row_range_global_index_scanner.h create mode 100644 src/paimon/common/global_index/offset_global_index_reader.cpp create mode 100644 src/paimon/common/global_index/offset_global_index_reader.h create mode 100644 src/paimon/common/global_index/offset_global_index_reader_test.cpp create mode 100644 src/paimon/common/global_index/union_global_index_reader.cpp create mode 100644 src/paimon/common/global_index/union_global_index_reader.h create mode 100644 src/paimon/common/global_index/union_global_index_reader_test.cpp delete mode 100644 src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp delete mode 100644 src/paimon/core/global_index/row_range_global_index_scanner_impl.h create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/README create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0323e05c-8903-49a2-9bc0-0a7e5a395211.index create mode 100644 
test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-061cad82-2fc6-448e-a3ce-7ac49c836273.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1c6c0c2a-b8f0-410e-90bf-26aadf8472af.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1cf3190c-b80f-46b9-bce3-0566612a2e3c.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2525fcef-a890-403a-a587-c9d63329adc6.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2d75bcb5-1f7d-40b6-9e13-ed11ac16cbb2.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-3311dd84-93f0-4b16-b3eb-54864d44767b.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-476c39a6-b760-45db-9602-a1e6559cfabe.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4c089e9d-f607-41dc-8edc-41e10a951eac.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4ed7238f-dd41-4ab0-ad3a-816a8f26aba3.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6eb46ccf-94b1-4ba2-8704-9ce827de9c5b.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77eb6c57-c860-47d1-83a9-ec93886e76c9.index create mode 100644 
test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a7e9396-772a-40c2-811c-ddc1f716e1d9.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-84feda98-48d8-40e1-84a1-5c5b737ef5e5.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8a91544c-8779-40bf-a66c-4f84f576a6ec.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b658fde-dc52-4948-8733-7a6b9e2aa58e.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b964514-2199-42d4-b2c2-625b461f82f3.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8dbf5e09-7023-4f97-8bbb-248d3eec6bed.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-97cac122-2d88-4b84-88aa-efe022506163.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-9f0d6ea2-e3d9-4877-9c57-85c285a2cd9d.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ae2b3e23-a92a-4688-b6f9-b35117567156.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-af76d02b-7f3a-4f6b-b45b-db44c5ff24cc.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-bd1a00c8-ff45-426d-af34-f7080ed247ba.index create mode 100644 
test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-befa46d1-beb2-4a58-912b-2318efd2105c.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c146d9cb-5e43-4bf8-891b-a05f92cd0563.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d8503faa-e1d3-474e-a1f1-2e380dec91d3.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d99a76c1-c181-49d8-8882-812700de4013.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-da0c9c05-4f11-4e9a-8246-01582ee5e614.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e81874df-120e-4124-9517-b8b7fbefc336.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f35fe6ce-b072-4fd6-a91c-a5d940ae5998.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f530db7c-32f7-4f35-b5d2-4b7f321488c4.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f67895be-6638-449c-9a1b-f2ac7d0efc7d.index create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-efa3dce8-fe86-499a-b436-182337edd173-0 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-804b07d3-0855-4246-b778-8bb12a5154dc-0 create mode 100644 
test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-d67632fc-11f7-484d-8634-5d4ae78743e5-0 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-1 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-0 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-1 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-0f1a5888-3dd7-4cc9-95f9-9a6cc88e88cf-0.parquet create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-38231641-4057-4e0d-a2d4-91a7842d2acb-0.parquet create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST create mode 100644 
test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 create mode 100644 test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/README create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03d348f1-3a2b-47b2-ab26-1d894a5bc150.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03da8b2e-7eb5-42c7-9741-6cfa57e3e305.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0b4db5e0-03a9-4fa4-ba96-0288a0eb0b19.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-10d1c370-e5ad-4530-9935-84e018b7cf8f.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-226a2318-32a7-4569-88b1-096c4029e4de.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-336a671f-39fd-4922-b322-ec854712e573.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-34391e56-cb42-4e31-9002-8be9162b9d77.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-36d11008-5e75-4801-82da-f5e7c7bc62e0.index create mode 100644 
test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5279021b-ac4b-4418-9e56-5e8b3ede837b.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5d9fd6d0-68e9-46c6-b064-04434bc36ad0.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-66e10354-4845-43e2-9838-3ceb058fe2c7.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6e579175-580a-4b14-ad1c-c3972407503c.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6fc58c7a-d306-4da1-a33e-22dfee9390e9.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77874487-0b24-461b-93ef-46f04280466a.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a81ec36-7dad-4a49-a5c8-62fdc3eb9064.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-86da803a-8554-4201-934c-0a8edf06bb94.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8be14989-0b16-44d6-aedd-0556af02c4b5.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-93f35d27-0323-42d2-9a03-651f4d75b4a6.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ac97eb16-ec2e-4a32-8366-939956cc32df.index create mode 100644 
test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c02c2b4e-e9b1-437e-82b2-9354c4f2ecfb.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c54baf11-8caf-47b4-9f04-e6e70aa64211.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c8749a02-24af-4188-8e28-a080b56e79f0.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-cc3a2bf4-5a19-46d1-8634-e7e1256026e8.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d02fec1b-f5eb-4890-a7eb-740cbc385f84.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d535d249-542f-4bb1-9a69-dc957176497b.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dc4937c8-cdc6-4447-8cc2-40ebebf338d7.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dd37e0dd-c106-48bd-8ba0-4315b38b2eed.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e2c03e02-aa1c-46d0-b779-4c9d203c7d73.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f91d5405-4116-4a89-acc6-4e4c028410e5.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-fcfd9ca5-fa06-422e-89a6-f33ea60630eb.index create mode 100644 
test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-fead2db1-5481-4792-b20c-d978dfc00de5.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ff65668f-e4d2-4e69-ab5a-79a777f51650.index create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-7b331606-88e1-4bdd-8426-8c6b88103b42-0 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-b2c71b39-9e84-4784-8441-a367e6aebaeb-0 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-ce023896-9468-49d9-8c55-e26271ff81da-0 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-0 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-1 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-0 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-1 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-0 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-1 create mode 100644 
test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-b6d91b0d-4fc5-410a-9157-2dd1cf34a15e-0.parquet create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-6ab3cbef-417f-47b0-83ee-4145bc3dedb7-0.parquet create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 create mode 100644 test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 diff --git a/include/paimon/global_index/global_index_reader.h b/include/paimon/global_index/global_index_reader.h index 9325735d2..6f76780e6 100644 --- a/include/paimon/global_index/global_index_reader.h +++ b/include/paimon/global_index/global_index_reader.h @@ -31,11 +31,13 @@ namespace paimon { /// /// Derived classes are expected to implement the visitor methods (e.g., `VisitEqual`, /// `VisitIsNull`, etc.) to return index-based results that indicate which -/// row satisfy the given predicate. +/// rows satisfy the given predicate. /// -/// @note All `GlobalIndexResult` objects returned by implementations of this class use **local row -/// ids** that start from 0 — not global row ids in the entire table. -/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`. 
+/// @note Leaf implementations of `GlobalIndexReader` (e.g., +/// `BTreeGlobalIndexReader`, `BitmapGlobalIndexReader`) return `GlobalIndexResult` +/// objects containing **local** row ids (starting from 0). Conversion to **global** row ids is +/// performed by wrapping the leaf reader with `OffsetGlobalIndexReader`, which adds +/// `GlobalIndexMeta.row_range_start` to every row id via `GlobalIndexResult::AddOffset()`. class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor> { public: /// VisitVectorSearch performs approximate vector similarity search. diff --git a/include/paimon/global_index/global_index_result.h b/include/paimon/global_index/global_index_result.h index bbde2aa0e..bff85ac6c 100644 --- a/include/paimon/global_index/global_index_result.h +++ b/include/paimon/global_index/global_index_result.h @@ -27,12 +27,12 @@ #include "paimon/visibility.h" namespace paimon { -/// Global index result to get selected global row ids. +/// Global index result that holds the row ids selected by a predicate. class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this { public: virtual ~GlobalIndexResult() = default; - /// Iterator interface for traversing selected global row ids. + /// Iterator interface for traversing selected row ids. class Iterator { public: virtual ~Iterator() = default; @@ -40,7 +40,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this IsEmpty() const = 0; - /// Creates a new iterator over the selected global row ids. + /// Creates a new iterator over the selected row ids. virtual Result> CreateIterator() const = 0; /// Returns non-overlapping, sorted ranges covering all row ids in `GlobalIndexResult`. @@ -125,7 +125,7 @@ class PAIMON_EXPORT ScoredGlobalIndexResult : public GlobalIndexResult { /// Retrieves the next (row_id, score) pair and advances the iterator.
/// /// @return A pair where: - /// - first: the global row id (returned in ascending order), + /// - first: the row id (returned in ascending order). /// - second: the associated score computed by the index. /// /// @note The sequence is ordered by **row_id**, not by score. diff --git a/include/paimon/global_index/global_index_scan.h b/include/paimon/global_index/global_index_scan.h index b24c697b2..205a5ca6a 100644 --- a/include/paimon/global_index/global_index_scan.h +++ b/include/paimon/global_index/global_index_scan.h @@ -23,39 +23,52 @@ #include #include -#include "paimon/global_index/row_range_global_index_scanner.h" +#include "paimon/global_index/global_index_reader.h" +#include "paimon/global_index/global_index_result.h" +#include "paimon/predicate/predicate.h" #include "paimon/utils/range.h" +#include "paimon/utils/row_range_index.h" #include "paimon/visibility.h" - namespace paimon { class MemoryPool; class FileSystem; + /// Represents a logical scan over a global index for a table. class PAIMON_EXPORT GlobalIndexScan { public: /// Creates a `GlobalIndexScan` instance for the specified table and context. - /// /// @param table_path Root directory of the table. /// @param snapshot_id Optional snapshot id to read from; if not provided, uses the latest. /// @param partitions Optional list of specific partitions to restrict the scan scope. /// Each map represents one partition (e.g., {"dt": "2024-06-01"}). - /// If omitted, scans all partitions. - /// @param options Index-specific configuration. + /// If omitted (`std::nullopt`), scans all partitions of the table. + /// @param options User defined configuration. /// @param file_system File system for accessing index files. /// If not provided (nullptr), it is inferred from the `FILE_SYSTEM` /// key in the `options` parameter. /// @param pool Memory pool for temporary allocations; if nullptr, uses default. 
/// @return A `Result` containing a unique pointer to the created scanner, - /// or an error if initialization fails (e.g., I/O error). + /// or an error if initialization fails (e.g., I/O error, invalid snapshot id, + /// unknown partition). static Result> Create( const std::string& table_path, const std::optional& snapshot_id, const std::optional>>& partitions, const std::map& options, const std::shared_ptr& file_system, const std::shared_ptr& pool); - /// Creates a `GlobalIndexScan` instance for the specified table and context. - /// - /// @param partition_filters Optional specific partition predicates. + /// Creates a `GlobalIndexScan` instance for the specified table and context, with a + /// predicate-based partition filter. + /// @param root_path Root directory of the table. + /// @param snapshot_id Optional snapshot id to read from; if not provided, uses the + /// latest snapshot. + /// @param partition_filters Optional partition-level predicate used for partition pruning. + /// If nullptr, all partitions are scanned. + /// @param options User defined configuration. + /// @param file_system File system for accessing index files. If nullptr, it is + /// inferred from the `FILE_SYSTEM` key in `options`. + /// @param memory_pool Memory pool for temporary allocations; if nullptr, uses default. + /// @return A `Result` containing a unique pointer to the created scanner, + /// or an error if initialization fails. static Result> Create( const std::string& root_path, const std::optional& snapshot_id, const std::shared_ptr& partition_filters, @@ -65,28 +78,30 @@ class PAIMON_EXPORT GlobalIndexScan { virtual ~GlobalIndexScan() = default; - /// Creates a scanner for the global index over the specified row id range. - /// - /// This method instantiates a low-level scanner that can evaluate predicates and - /// retrieve matching row ids from the global index data corresponding to the given - /// row id range. 
- /// - /// @param range The inclusive row id range [start, end] for which to create the scanner. - /// The range must be fully covered by existing global index data (from - /// `GetRowRangeList()`). - /// @return A `Result` containing a range-level scanner, or an error if parse index meta fails. - virtual Result> CreateRangeScan( - const Range& range) = 0; + /// Creates several `GlobalIndexReader`s for a specific field. + /// @param field_name Name of the indexed column. + /// @param row_range_index Optional row range that limits the scan to a sub-range of row ids. + /// If not provided, the entire row range is considered. + /// @return A `Result` that is: + /// - Successful with several readers if the indexes exist and load correctly; + /// - Successful with an empty vector if no index was built for the given field; + /// - Error returns when loading fails (e.g., file corruption, I/O error, + /// unsupported format). + virtual Result>> CreateReaders( + const std::string& field_name, + const std::optional& row_range_index) const = 0; - /// Returns row id ranges covered by this global index (sorted and non-overlapping - /// ranges). - /// - /// Each `Range` represents a contiguous segment of row ids for which global index - /// data exists. This allows the query engine to parallelize scanning and be aware - /// of ranges that are not covered by any global index. - /// - /// @return A `Result` containing sorted and non-overlapping `Range` objects. - virtual Result> GetRowRangeList() = 0; + /// Creates several `GlobalIndexReader`s for a specific field (looked up by id). + /// @param field_id Field id of the indexed column. + /// @param row_range_index Optional row range that limits the scan to a sub-range of row ids. + /// If not provided, the entire row range is considered. 
+ /// @return A `Result` that is: + /// - Successful with several readers if the indexes exist and load correctly; + /// - Successful with an empty vector if no index was built for the given field; + /// - Error returns when loading fails (e.g., file corruption, I/O error, + /// unsupported format). + virtual Result>> CreateReaders( + int32_t field_id, const std::optional& row_range_index) const = 0; }; } // namespace paimon diff --git a/include/paimon/global_index/row_range_global_index_scanner.h b/include/paimon/global_index/row_range_global_index_scanner.h deleted file mode 100644 index 996b2c2e7..000000000 --- a/include/paimon/global_index/row_range_global_index_scanner.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include "paimon/global_index/global_index_reader.h" -#include "paimon/visibility.h" - -namespace paimon { -/// Interface for scanning global index data at the range level. -class PAIMON_EXPORT RowRangeGlobalIndexScanner { - public: - virtual ~RowRangeGlobalIndexScanner() = default; - - /// Creates a `GlobalIndexReader` for a specific field and index type within this range. - /// - /// This reader provides low-level access to the serialized index data - /// for the given column (`field_name`) and index kind (`index_type`, such as "bitmap"). - /// - /// @param field_name Name of the indexed column. 
- /// @param index_type Type of the global index (e.g., "bitmap", "lumina"). - /// @return A `Result` that is: - /// - Successful with a non-null reader if the index exists and loads correctly; - /// - Successful with a null pointer if no index was built for the given field and type; - /// - An error only if loading fails (e.g., file corruption, I/O error, unsupported - /// format). - /// @note All `GlobalIndexResult` objects returned by `GlobalIndexReader` use **local row - /// ids** that start from 0 — not global row ids in the entire table. - virtual Result> CreateReader( - const std::string& field_name, const std::string& index_type) const = 0; - - /// Creates several `GlobalIndexReader`s for a specific field within this range. - /// - /// @param field_name Name of the indexed column. - /// @return A `Result` that is: - /// - Successful with several readers if the indexes exist and load correctly; - /// - Successful with an empty vector if no index was built for the given field; - /// - Error returns when loading fails (e.g., file corruption, I/O error, unsupported - /// format). 
- virtual Result>> CreateReaders( - const std::string& field_name) const = 0; -}; - -} // namespace paimon diff --git a/include/paimon/scan_context.h b/include/paimon/scan_context.h index c40fe62de..ee6b18515 100644 --- a/include/paimon/scan_context.h +++ b/include/paimon/scan_context.h @@ -25,7 +25,6 @@ #include "paimon/global_index/global_index_result.h" #include "paimon/predicate/predicate.h" -#include "paimon/predicate/vector_search.h" #include "paimon/result.h" #include "paimon/type_fwd.h" #include "paimon/visibility.h" @@ -104,19 +103,14 @@ class PAIMON_EXPORT ScanFilter { public: ScanFilter(const std::shared_ptr& predicate, const std::vector>& partition_filters, - const std::optional& bucket_filter, - const std::shared_ptr& vector_search) + const std::optional& bucket_filter) : predicates_(predicate), - vector_search_(vector_search), bucket_filter_(bucket_filter), partition_filters_(partition_filters) {} std::shared_ptr GetPredicate() const { return predicates_; } - std::shared_ptr GetVectorSearch() const { - return vector_search_; - } std::optional GetBucketFilter() const { return bucket_filter_; } @@ -126,7 +120,6 @@ class PAIMON_EXPORT ScanFilter { private: std::shared_ptr predicates_; - std::shared_ptr vector_search_; std::optional bucket_filter_; std::vector> partition_filters_; }; @@ -155,8 +148,6 @@ class PAIMON_EXPORT ScanContextBuilder { ScanContextBuilder& SetGlobalIndexResult( const std::shared_ptr& global_index_result); - /// Set vector search for similarity search. - ScanContextBuilder& SetVectorSearch(const std::shared_ptr& vector_search); /// The options added or set in `ScanContextBuilder` have high priority and will be merged with /// the options in table schema. 
ScanContextBuilder& AddOption(const std::string& key, const std::string& value); diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index 1301169b5..e314ef3d7 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -53,6 +53,8 @@ set(PAIMON_COMMON_SRCS common/fs/resolving_file_system.cpp common/fs/file_system_factory.cpp common/global_config.cpp + common/global_index/union_global_index_reader.cpp + common/global_index/offset_global_index_reader.cpp common/global_index/complete_index_score_batch_reader.cpp common/global_index/bitmap_scored_global_index_result.cpp common/global_index/bitmap_global_index_result.cpp @@ -195,7 +197,6 @@ set(PAIMON_CORE_SRCS core/global_index/global_index_evaluator_impl.cpp core/global_index/global_index_scan.cpp core/global_index/global_index_scan_impl.cpp - core/global_index/row_range_global_index_scanner_impl.cpp core/global_index/global_index_write_task.cpp core/index/index_file_handler.cpp core/index/global_index_meta.cpp @@ -413,6 +414,8 @@ if(PAIMON_BUILD_TESTS) common/global_index/complete_index_score_batch_reader_test.cpp common/global_index/global_index_result_test.cpp common/global_index/global_index_utils_test.cpp + common/global_index/offset_global_index_reader_test.cpp + common/global_index/union_global_index_reader_test.cpp common/global_index/global_indexer_factory_test.cpp common/global_index/bitmap_global_index_result_test.cpp common/global_index/bitmap_scored_global_index_result_test.cpp diff --git a/src/paimon/common/global_index/CMakeLists.txt b/src/paimon/common/global_index/CMakeLists.txt index c4e7d9548..b86ce0353 100644 --- a/src/paimon/common/global_index/CMakeLists.txt +++ b/src/paimon/common/global_index/CMakeLists.txt @@ -25,7 +25,9 @@ set(PAIMON_GLOBAL_INDEX_SRC btree/lazy_filtered_btree_reader.cpp btree/key_serializer.cpp rangebitmap/range_bitmap_global_index.cpp - rangebitmap/range_bitmap_global_index_factory.cpp) + rangebitmap/range_bitmap_global_index_factory.cpp + 
offset_global_index_reader.cpp + union_global_index_reader.cpp) add_paimon_lib(paimon_global_index SOURCES diff --git a/src/paimon/common/global_index/btree/btree_file_meta_selector_test.cpp b/src/paimon/common/global_index/btree/btree_file_meta_selector_test.cpp index c24c3c61b..31e45f642 100644 --- a/src/paimon/common/global_index/btree/btree_file_meta_selector_test.cpp +++ b/src/paimon/common/global_index/btree/btree_file_meta_selector_test.cpp @@ -182,7 +182,7 @@ TEST_F(BTreeFileMetaSelectorTest, TestVisitIn) { // 1 in [1,10]=file1, [1,5]=file4 // 2 in [1,10]=file1, [1,5]=file4 // 3 in [1,10]=file1, [1,5]=file4 - // 26 in [21,30]=file3, [19,25]=file5 + // 26 in [21,30]=file3 // 27 in [21,30]=file3 // 28 in [21,30]=file3 ASSERT_OK_AND_ASSIGN(auto result, selector.VisitIn({Literal(1), Literal(2), Literal(3), diff --git a/src/paimon/common/global_index/btree/btree_global_index_writer.cpp b/src/paimon/common/global_index/btree/btree_global_index_writer.cpp index 5eabae3fb..bfb99e8b6 100644 --- a/src/paimon/common/global_index/btree/btree_global_index_writer.cpp +++ b/src/paimon/common/global_index/btree/btree_global_index_writer.cpp @@ -119,7 +119,10 @@ Status BTreeGlobalIndexWriter::Flush() { return Status::OK(); } MemorySliceOutput output(current_row_ids_.size() * 9 + 5, pool_.get()); - PAIMON_RETURN_NOT_OK(output.WriteVarLenInt(current_row_ids_.size())); + if (current_row_ids_.size() > INT32_MAX) { + return Status::Invalid("invalid row id numbers, exceed INT32_MAX"); + } + PAIMON_RETURN_NOT_OK(output.WriteVarLenInt(static_cast(current_row_ids_.size()))); for (int64_t row_id : current_row_ids_) { PAIMON_RETURN_NOT_OK(output.WriteVarLenLong(row_id)); } diff --git a/src/paimon/common/global_index/btree/btree_global_indexer.cpp b/src/paimon/common/global_index/btree/btree_global_indexer.cpp index 0d5ccd3c2..a21edf166 100644 --- a/src/paimon/common/global_index/btree/btree_global_indexer.cpp +++ b/src/paimon/common/global_index/btree/btree_global_indexer.cpp @@ -70,7 
+70,10 @@ Result> BTreeGlobalIndexer::CreateWriter( std::string block_size_str, OptionsUtils::GetValueFromMap(options_, BtreeDefs::kBtreeIndexBlockSize, BtreeDefs::kDefaultBtreeIndexBlockSize)); - PAIMON_ASSIGN_OR_RAISE(int32_t block_size, MemorySize::ParseBytes(block_size_str)); + PAIMON_ASSIGN_OR_RAISE(int64_t block_size, MemorySize::ParseBytes(block_size_str)); + if (block_size > INT32_MAX) { + return Status::Invalid("invalid block size, exceed INT32_MAX"); + } PAIMON_ASSIGN_OR_RAISE( std::string compress_str, OptionsUtils::GetValueFromMap(options_, BtreeDefs::kBtreeIndexCompression, @@ -82,8 +85,9 @@ Result> BTreeGlobalIndexer::CreateWriter( CompressOptions compress_options{compress_str, compress_level}; PAIMON_ASSIGN_OR_RAISE(std::shared_ptr compression_factory, BlockCompressionFactory::Create(compress_options)); - return BTreeGlobalIndexWriter::Create(field_name, struct_type, file_writer, block_size, - compression_factory, pool); + return BTreeGlobalIndexWriter::Create(field_name, struct_type, file_writer, + static_cast(block_size), compression_factory, + pool); } Result> BTreeGlobalIndexer::CreateReader( diff --git a/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.cpp b/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.cpp index de054607e..f2ae0338e 100644 --- a/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.cpp +++ b/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.cpp @@ -24,6 +24,7 @@ #include "paimon/common/global_index/btree/btree_global_index_reader.h" #include "paimon/common/global_index/btree/btree_index_meta.h" #include "paimon/common/global_index/btree/key_serializer.h" +#include "paimon/common/global_index/union_global_index_reader.h" #include "paimon/common/memory/memory_slice.h" #include "paimon/common/memory/memory_slice_input.h" #include "paimon/common/sst/block_cache.h" @@ -170,55 +171,23 @@ Result> LazyFilteredBTreeReader::DispatchVisi return std::make_shared([]() { return 
RoaringBitmap64(); }); } - // Prepare all readers sequentially (reader_cache_ is not thread-safe) + // Create a UnionGlobalIndexReader from cached readers for the selected files + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr union_reader, + CreateUnionReader(selected_files)); + + // Delegate the action to the union reader + return action(union_reader); +} + +Result> LazyFilteredBTreeReader::CreateUnionReader( + const std::vector& files) { std::vector> readers; - readers.reserve(selected_files.size()); - for (const auto& meta : selected_files) { + readers.reserve(files.size()); + for (const auto& meta : files) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, GetOrCreateReader(meta)); readers.push_back(std::move(reader)); } - - // Execute actions: parallel if executor is available, sequential otherwise - std::vector>> collected_results; - if (executor_ != nullptr) { - // Parallel: submit all tasks to executor, then collect results in order - std::vector>>> futures; - futures.reserve(readers.size()); - for (const auto& reader : readers) { - futures.push_back( - Via(executor_.get(), - [&action, &reader]() -> Result> { - return action(reader); - })); - } - collected_results = CollectAll(futures); - } else { - // Sequential fallback: execute actions one by one - collected_results.reserve(readers.size()); - for (const auto& reader : readers) { - collected_results.push_back(action(reader)); - } - } - - // Merge results in submission order - std::shared_ptr merged_result = nullptr; - for (auto& result_or_status : collected_results) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, - std::move(result_or_status)); - if (result == nullptr) { - continue; - } - if (merged_result == nullptr) { - merged_result = std::move(result); - } else { - PAIMON_ASSIGN_OR_RAISE(merged_result, merged_result->Or(result)); - } - } - - if (merged_result == nullptr) { - return Status::Invalid("DispatchVisit cannot return empty result"); - } - return merged_result; + return 
std::make_shared(std::move(readers), executor_); } Result> LazyFilteredBTreeReader::GetOrCreateReader( diff --git a/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.h b/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.h index 372ba7691..4889b1e4e 100644 --- a/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.h +++ b/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.h @@ -82,6 +82,8 @@ class LazyFilteredBTreeReader : public GlobalIndexReader { Result> DispatchVisit(SelectAction select_files, ReaderAction action); + Result> CreateUnionReader( + const std::vector& files); Result> GetOrCreateReader(const GlobalIndexIOMeta& meta); Result> CreateSingleReader(const GlobalIndexIOMeta& meta); Result ReadNullBitmap(const std::shared_ptr& cache, diff --git a/src/paimon/common/global_index/offset_global_index_reader.cpp b/src/paimon/common/global_index/offset_global_index_reader.cpp new file mode 100644 index 000000000..2e3130f6f --- /dev/null +++ b/src/paimon/common/global_index/offset_global_index_reader.cpp @@ -0,0 +1,151 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/global_index/offset_global_index_reader.h" + +#include + +namespace paimon { + +OffsetGlobalIndexReader::OffsetGlobalIndexReader(std::shared_ptr&& wrapped, + int64_t offset) + : wrapped_(std::move(wrapped)), offset_(offset) {} + +Result> OffsetGlobalIndexReader::VisitIsNotNull() { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, wrapped_->VisitIsNotNull()); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitIsNull() { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, wrapped_->VisitIsNull()); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitEqual( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitEqual(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitNotEqual( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitNotEqual(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitLessThan( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitLessThan(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitLessOrEqual( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitLessOrEqual(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitGreaterThan( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitGreaterThan(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitGreaterOrEqual( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitGreaterOrEqual(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitIn( + const std::vector& literals) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, wrapped_->VisitIn(literals)); + return ApplyOffset(result); +} + +Result> 
OffsetGlobalIndexReader::VisitNotIn( + const std::vector& literals) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitNotIn(literals)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitStartsWith( + const Literal& prefix) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitStartsWith(prefix)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitEndsWith( + const Literal& suffix) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitEndsWith(suffix)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitContains( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitContains(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitLike( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, wrapped_->VisitLike(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitVectorSearch( + const std::shared_ptr& vector_search) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitVectorSearch(vector_search)); + if (result == nullptr) { + return std::shared_ptr(); + } + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr offset_result, + result->AddOffset(offset_)); + auto scored_result = std::dynamic_pointer_cast(offset_result); + if (!scored_result) { + return Status::Invalid( + "AddOffset on ScoredGlobalIndexResult did not return ScoredGlobalIndexResult"); + } + return scored_result; +} + +Result> OffsetGlobalIndexReader::VisitFullTextSearch( + const std::shared_ptr& full_text_search) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitFullTextSearch(full_text_search)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::ApplyOffset( + const std::shared_ptr& result) { + if (result == nullptr) { + return result; + } + return result->AddOffset(offset_); +} + +} // namespace paimon diff --git 
a/src/paimon/common/global_index/offset_global_index_reader.h b/src/paimon/common/global_index/offset_global_index_reader.h new file mode 100644 index 000000000..8b2d6034c --- /dev/null +++ b/src/paimon/common/global_index/offset_global_index_reader.h @@ -0,0 +1,77 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/global_index/global_index_reader.h" + +namespace paimon { + +/// A GlobalIndexReader that wraps another reader and applies an offset to all row ids in the +/// results. This is used to convert local row ids into global row ids. +class OffsetGlobalIndexReader : public GlobalIndexReader { + public: + /// Constructs an OffsetGlobalIndexReader. + /// @param wrapped The inner reader to delegate queries to. Its results are expected to + /// contain local row ids. + /// @param offset The offset to add to each row id in the results. 
+ OffsetGlobalIndexReader(std::shared_ptr&& wrapped, int64_t offset); + + Result> VisitIsNotNull() override; + Result> VisitIsNull() override; + Result> VisitEqual(const Literal& literal) override; + Result> VisitNotEqual(const Literal& literal) override; + Result> VisitLessThan(const Literal& literal) override; + Result> VisitLessOrEqual(const Literal& literal) override; + Result> VisitGreaterThan(const Literal& literal) override; + Result> VisitGreaterOrEqual(const Literal& literal) override; + Result> VisitIn( + const std::vector& literals) override; + Result> VisitNotIn( + const std::vector& literals) override; + Result> VisitStartsWith(const Literal& prefix) override; + Result> VisitEndsWith(const Literal& suffix) override; + Result> VisitContains(const Literal& literal) override; + Result> VisitLike(const Literal& literal) override; + + Result> VisitVectorSearch( + const std::shared_ptr& vector_search) override; + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) override; + + bool IsThreadSafe() const override { + return wrapped_->IsThreadSafe(); + } + + std::string GetIndexType() const override { + return wrapped_->GetIndexType(); + } + + private: + Result> ApplyOffset( + const std::shared_ptr& result); + + private: + std::shared_ptr wrapped_; + int64_t offset_; +}; + +} // namespace paimon diff --git a/src/paimon/common/global_index/offset_global_index_reader_test.cpp b/src/paimon/common/global_index/offset_global_index_reader_test.cpp new file mode 100644 index 000000000..8cee508f1 --- /dev/null +++ b/src/paimon/common/global_index/offset_global_index_reader_test.cpp @@ -0,0 +1,311 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/global_index/offset_global_index_reader.h" + +#include +#include + +#include "gtest/gtest.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/predicate/literal.h" +#include "paimon/testing/utils/testharness.h" +#include "paimon/utils/roaring_bitmap64.h" + +namespace paimon::test { +class FakeGlobalIndexReader : public GlobalIndexReader { + public: + void SetDefaultResult(const std::vector& row_ids) { + default_result_ = row_ids; + } + + Result> VisitIsNotNull() override { + return MakeResult(default_result_); + } + + Result> VisitIsNull() override { + return MakeResult(default_result_); + } + + Result> VisitEqual(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitNotEqual(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitLessThan(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitLessOrEqual(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitGreaterThan(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitGreaterOrEqual( + const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitIn( + const std::vector& literals) override { + return MakeResult(default_result_); + } + + Result> VisitNotIn( + const std::vector& literals) override { + return MakeResult(default_result_); + } + + Result> VisitStartsWith(const Literal& prefix) override { + return 
MakeResult(default_result_); + } + + Result> VisitEndsWith(const Literal& suffix) override { + return MakeResult(default_result_); + } + + Result> VisitContains(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitLike(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitVectorSearch( + const std::shared_ptr& vector_search) override { + return Status::Invalid("FakeGlobalIndexReader does not support vector search"); + } + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) override { + return MakeResult(default_result_); + } + + bool IsThreadSafe() const override { + return true; + } + + std::string GetIndexType() const override { + return "fake"; + } + + private: + static Result> MakeResult( + const std::vector& row_ids) { + auto ids = row_ids; + return std::make_shared( + [ids]() { return RoaringBitmap64::From(ids); }); + } + + private: + std::vector default_result_; +}; + +class OffsetGlobalIndexReaderTest : public ::testing::Test { + public: + void CheckResult(const std::shared_ptr& result, + const std::vector& expected) const { + ASSERT_TRUE(result); + auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected).ToString(); + } +}; + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitEqualWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 1, 3}); + + auto offset_reader = std::make_shared(fake_reader, 100); + + Literal literal_5(5); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitEqual(literal_5)); + // row ids {0, 1, 3} + offset 100 -> {100, 101, 103} + CheckResult(result, {100, 101, 103}); +} + +TEST_F(OffsetGlobalIndexReaderTest, 
TestVisitIsNotNullWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 2, 4, 6}); + + auto offset_reader = std::make_shared(fake_reader, 50); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIsNotNull()); + // row ids {0, 2, 4, 6} + offset 50 -> {50, 52, 54, 56} + CheckResult(result, {50, 52, 54, 56}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitIsNullWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({1, 3}); + + auto offset_reader = std::make_shared(fake_reader, 200); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIsNull()); + // row ids {1, 3} + offset 200 -> {201, 203} + CheckResult(result, {201, 203}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitLessThanWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 1}); + + auto offset_reader = std::make_shared(fake_reader, 10); + + Literal literal_5(5); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitLessThan(literal_5)); + // row ids {0, 1} + offset 10 -> {10, 11} + CheckResult(result, {10, 11}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitGreaterOrEqualWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({3, 4, 5}); + + auto offset_reader = std::make_shared(fake_reader, 1000); + + Literal literal_10(10); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitGreaterOrEqual(literal_10)); + // row ids {3, 4, 5} + offset 1000 -> {1003, 1004, 1005} + CheckResult(result, {1003, 1004, 1005}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitInWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 2}); + + auto offset_reader = std::make_shared(fake_reader, 5); + + std::vector literals = {Literal(1), Literal(3)}; + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIn(literals)); + // row ids {0, 2} + offset 5 -> {5, 7} + CheckResult(result, {5, 7}); +} + +TEST_F(OffsetGlobalIndexReaderTest, 
TestVisitNotInWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({1, 3, 5}); + + auto offset_reader = std::make_shared(fake_reader, 20); + + std::vector literals = {Literal(2)}; + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitNotIn(literals)); + // row ids {1, 3, 5} + offset 20 -> {21, 23, 25} + CheckResult(result, {21, 23, 25}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitNotEqualWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 2, 4}); + + auto offset_reader = std::make_shared(fake_reader, 7); + + Literal literal_1(1); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitNotEqual(literal_1)); + // row ids {0, 2, 4} + offset 7 -> {7, 9, 11} + CheckResult(result, {7, 9, 11}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitLessOrEqualWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 1, 2}); + + auto offset_reader = std::make_shared(fake_reader, 30); + + Literal literal_3(3); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitLessOrEqual(literal_3)); + // row ids {0, 1, 2} + offset 30 -> {30, 31, 32} + CheckResult(result, {30, 31, 32}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitGreaterThanWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({4, 5}); + + auto offset_reader = std::make_shared(fake_reader, 15); + + Literal literal_3(3); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitGreaterThan(literal_3)); + // row ids {4, 5} + offset 15 -> {19, 20} + CheckResult(result, {19, 20}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestZeroOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 1, 2}); + + auto offset_reader = std::make_shared(fake_reader, 0); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIsNotNull()); + // row ids {0, 1, 2} + offset 0 -> {0, 1, 2} + CheckResult(result, {0, 1, 2}); +} + +TEST_F(OffsetGlobalIndexReaderTest, 
TestEmptyResultWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({}); + + auto offset_reader = std::make_shared(fake_reader, 100); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIsNotNull()); + // empty result + offset 100 -> still empty + CheckResult(result, {}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestIsThreadSafeDelegated) { + auto fake_reader = std::make_shared(); + auto offset_reader = std::make_shared(fake_reader, 100); + // FakeGlobalIndexReader returns true for IsThreadSafe + ASSERT_TRUE(offset_reader->IsThreadSafe()); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestGetIndexTypeDelegated) { + auto fake_reader = std::make_shared(); + auto offset_reader = std::make_shared(fake_reader, 100); + ASSERT_EQ(offset_reader->GetIndexType(), "fake"); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitFullTextSearchWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 3, 5}); + + auto offset_reader = std::make_shared(fake_reader, 10); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitFullTextSearch(nullptr)); + // row ids {0, 3, 5} + offset 10 -> {10, 13, 15} + CheckResult(result, {10, 13, 15}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitVectorSearchNotSupported) { + auto fake_reader = std::make_shared(); + auto offset_reader = std::make_shared(fake_reader, 10); + // FakeGlobalIndexReader returns error for VectorSearch + ASSERT_NOK_WITH_MSG(offset_reader->VisitVectorSearch(nullptr), + "FakeGlobalIndexReader does not support vector search"); +} + +} // namespace paimon::test diff --git a/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp b/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp index 1547ee95f..14854f189 100644 --- a/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp +++ b/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp @@ -17,6 +17,7 @@ #include 
#include +#include #include #include diff --git a/src/paimon/common/global_index/union_global_index_reader.cpp b/src/paimon/common/global_index/union_global_index_reader.cpp new file mode 100644 index 000000000..6108dcec9 --- /dev/null +++ b/src/paimon/common/global_index/union_global_index_reader.cpp @@ -0,0 +1,207 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/global_index/union_global_index_reader.h" + +#include +#include + +#include "paimon/common/executor/future.h" + +namespace paimon { +UnionGlobalIndexReader::UnionGlobalIndexReader( + std::vector>&& readers, + const std::shared_ptr& executor) + : readers_(std::move(readers)), executor_(executor) {} + +Result> UnionGlobalIndexReader::VisitIsNotNull() { + return Union( + [](const std::shared_ptr& reader) { return reader->VisitIsNotNull(); }); +} + +Result> UnionGlobalIndexReader::VisitIsNull() { + return Union( + [](const std::shared_ptr& reader) { return reader->VisitIsNull(); }); +} + +Result> UnionGlobalIndexReader::VisitEqual( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitEqual(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitNotEqual( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitNotEqual(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitLessThan( + const Literal& literal) { + 
return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitLessThan(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitLessOrEqual( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitLessOrEqual(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitGreaterThan( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitGreaterThan(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitGreaterOrEqual( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitGreaterOrEqual(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitIn( + const std::vector& literals) { + return Union([&literals](const std::shared_ptr& reader) { + return reader->VisitIn(literals); + }); +} + +Result> UnionGlobalIndexReader::VisitNotIn( + const std::vector& literals) { + return Union([&literals](const std::shared_ptr& reader) { + return reader->VisitNotIn(literals); + }); +} + +Result> UnionGlobalIndexReader::VisitStartsWith( + const Literal& prefix) { + return Union([&prefix](const std::shared_ptr& reader) { + return reader->VisitStartsWith(prefix); + }); +} + +Result> UnionGlobalIndexReader::VisitEndsWith( + const Literal& suffix) { + return Union([&suffix](const std::shared_ptr& reader) { + return reader->VisitEndsWith(suffix); + }); +} + +Result> UnionGlobalIndexReader::VisitContains( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitContains(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitLike( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitLike(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitVectorSearch( + const std::shared_ptr& vector_search) { + auto results = ExecuteAllReaders>>( + [&vector_search](const std::shared_ptr& reader) + -> Result> { 
+ return reader->VisitVectorSearch(vector_search); + }); + + std::shared_ptr merged_result = nullptr; + for (auto& result_or_status : results) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + std::move(result_or_status)); + if (result == nullptr) { + continue; + } + if (merged_result == nullptr) { + merged_result = std::move(result); + } else { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr merged_as_base, + merged_result->Or(result)); + merged_result = std::dynamic_pointer_cast(merged_as_base); + if (!merged_result) { + return Status::Invalid( + "Or of ScoredGlobalIndexResult did not return ScoredGlobalIndexResult in " + "UnionGlobalIndexReader"); + } + } + } + + return merged_result; +} + +Result> UnionGlobalIndexReader::VisitFullTextSearch( + const std::shared_ptr& full_text_search) { + return Union([&full_text_search](const std::shared_ptr& reader) { + return reader->VisitFullTextSearch(full_text_search); + }); +} + +Result> UnionGlobalIndexReader::Union(ReaderAction action) { + auto results = ExecuteAllReaders>>( + [&action](const std::shared_ptr& reader) + -> Result> { return action(reader); }); + + std::shared_ptr merged_result = nullptr; + for (auto& result_or_status : results) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + std::move(result_or_status)); + if (result == nullptr) { + continue; + } + if (merged_result == nullptr) { + merged_result = std::move(result); + } else { + PAIMON_ASSIGN_OR_RAISE(merged_result, merged_result->Or(result)); + } + } + + return merged_result; +} + +template +std::vector UnionGlobalIndexReader::ExecuteAllReaders( + const std::function&)>& action) { + if (executor_ == nullptr) { + std::vector results; + results.reserve(readers_.size()); + for (const auto& reader : readers_) { + results.push_back(action(reader)); + } + return results; + } + + // Parallel: submit all tasks to executor, then collect results in submission order + std::vector> futures; + futures.reserve(readers_.size()); + for (const auto& reader : readers_) { 
+ futures.push_back( + Via(executor_.get(), [&action, &reader]() -> R { return action(reader); })); + } + return CollectAll(futures); +} + +} // namespace paimon diff --git a/src/paimon/common/global_index/union_global_index_reader.h b/src/paimon/common/global_index/union_global_index_reader.h new file mode 100644 index 000000000..1f253ca47 --- /dev/null +++ b/src/paimon/common/global_index/union_global_index_reader.h @@ -0,0 +1,85 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/executor.h" +#include "paimon/global_index/global_index_reader.h" + +namespace paimon { +/// A GlobalIndexReader that combines results from multiple readers by performing a union +/// operation on their results. +/// +/// When an executor is provided, all sub-reader actions are submitted in parallel and results are +/// collected in submission order. Otherwise, sub-readers are evaluated sequentially. 
+class UnionGlobalIndexReader : public GlobalIndexReader { + public: + UnionGlobalIndexReader(std::vector>&& readers, + const std::shared_ptr& executor); + + Result> VisitIsNotNull() override; + Result> VisitIsNull() override; + Result> VisitEqual(const Literal& literal) override; + Result> VisitNotEqual(const Literal& literal) override; + Result> VisitLessThan(const Literal& literal) override; + Result> VisitLessOrEqual(const Literal& literal) override; + Result> VisitGreaterThan(const Literal& literal) override; + Result> VisitGreaterOrEqual(const Literal& literal) override; + Result> VisitIn( + const std::vector& literals) override; + Result> VisitNotIn( + const std::vector& literals) override; + Result> VisitStartsWith(const Literal& prefix) override; + Result> VisitEndsWith(const Literal& suffix) override; + Result> VisitContains(const Literal& literal) override; + Result> VisitLike(const Literal& literal) override; + + Result> VisitVectorSearch( + const std::shared_ptr& vector_search) override; + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) override; + + bool IsThreadSafe() const override { + return false; + } + + std::string GetIndexType() const override { + return "union"; + } + + private: + using ReaderAction = std::function>( + const std::shared_ptr&)>; + + /// Executes the given action on all readers and merges results with Union. + Result> Union(ReaderAction action); + + /// Executes the given action on all readers (parallel or sequential) and collects results. 
+ template + std::vector ExecuteAllReaders( + const std::function&)>& action); + + std::vector> readers_; + std::shared_ptr executor_; +}; + +} // namespace paimon diff --git a/src/paimon/common/global_index/union_global_index_reader_test.cpp b/src/paimon/common/global_index/union_global_index_reader_test.cpp new file mode 100644 index 000000000..f6f77d464 --- /dev/null +++ b/src/paimon/common/global_index/union_global_index_reader_test.cpp @@ -0,0 +1,460 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/global_index/union_global_index_reader.h" + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "paimon/executor.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/predicate/literal.h" +#include "paimon/testing/utils/testharness.h" +#include "paimon/utils/roaring_bitmap64.h" + +namespace paimon::test { +class FakeReader : public GlobalIndexReader { + public: + /// Sets the result returned by all Visit* methods (default behavior). + /// Pass an empty vector for an empty bitmap. + void SetDefaultResult(const std::vector& row_ids) { + default_result_ = row_ids; + return_nullptr_ = false; + return_error_ = false; + } + + /// Configures this reader to return nullptr for all Visit* methods. 
+ void SetReturnNullptr() { + return_nullptr_ = true; + return_error_ = false; + } + + /// Configures this reader to return an error Status for all Visit* methods. + void SetReturnError(const std::string& message) { + return_error_ = true; + return_nullptr_ = false; + error_message_ = message; + } + + /// Counts how many times any Visit* method was invoked. Useful to assert all readers + /// are exercised by UnionGlobalIndexReader. + int InvocationCount() const { + return invocation_count_.load(); + } + + Result> VisitIsNotNull() override { + return MakeResult(); + } + + Result> VisitIsNull() override { + return MakeResult(); + } + + Result> VisitEqual(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitNotEqual(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitLessThan(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitLessOrEqual(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitGreaterThan(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitGreaterOrEqual( + const Literal& literal) override { + return MakeResult(); + } + + Result> VisitIn( + const std::vector& literals) override { + return MakeResult(); + } + + Result> VisitNotIn( + const std::vector& literals) override { + return MakeResult(); + } + + Result> VisitStartsWith(const Literal& prefix) override { + return MakeResult(); + } + + Result> VisitEndsWith(const Literal& suffix) override { + return MakeResult(); + } + + Result> VisitContains(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitLike(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitVectorSearch( + const std::shared_ptr& vector_search) override { + invocation_count_++; + if (return_error_) { + return Status::Invalid(error_message_); + } + return std::shared_ptr(nullptr); + } + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) 
override { + return MakeResult(); + } + + bool IsThreadSafe() const override { + return true; + } + + std::string GetIndexType() const override { + return "fake"; + } + + private: + Result> MakeResult() { + invocation_count_++; + if (return_error_) { + return Status::Invalid(error_message_); + } + if (return_nullptr_) { + return std::shared_ptr(nullptr); + } + auto ids = default_result_; + return std::make_shared( + [ids]() { return RoaringBitmap64::From(ids); }); + } + + private: + std::vector default_result_; + bool return_nullptr_ = false; + bool return_error_ = false; + std::string error_message_; + std::atomic invocation_count_{0}; +}; + +class UnionGlobalIndexReaderTest : public ::testing::Test { + public: + static void CheckResult(const std::shared_ptr& result, + const std::vector& expected) { + ASSERT_TRUE(result); + auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected).ToString(); + } +}; + +TEST_F(UnionGlobalIndexReaderTest, TestSingleReaderUnion) { + auto reader = std::make_shared(); + reader->SetDefaultResult({1, 2, 3}); + + std::vector> readers = {reader}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + CheckResult(result, {1, 2, 3}); + ASSERT_EQ(reader->InvocationCount(), 1); +} + +TEST_F(UnionGlobalIndexReaderTest, TestMultipleReadersUnionSequential) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + auto reader3 = std::make_shared(); + reader1->SetDefaultResult({1, 2}); + reader2->SetDefaultResult({3, 4}); + reader3->SetDefaultResult({5}); + + std::vector> readers = {reader1, reader2, reader3}; + UnionGlobalIndexReader union_reader(std::move(readers), 
nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + // {1,2} OR {3,4} OR {5} -> {1,2,3,4,5} + CheckResult(result, {1, 2, 3, 4, 5}); + ASSERT_EQ(reader1->InvocationCount(), 1); + ASSERT_EQ(reader2->InvocationCount(), 1); + ASSERT_EQ(reader3->InvocationCount(), 1); +} + +TEST_F(UnionGlobalIndexReaderTest, TestMultipleReadersUnionOverlappingIds) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1, 2, 3}); + reader2->SetDefaultResult({2, 3, 4}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + // {1,2,3} OR {2,3,4} -> {1,2,3,4} + CheckResult(result, {1, 2, 3, 4}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestMultipleReadersUnionWithExecutor) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + auto reader3 = std::make_shared(); + reader1->SetDefaultResult({10}); + reader2->SetDefaultResult({20}); + reader3->SetDefaultResult({30}); + + std::vector> readers = {reader1, reader2, reader3}; + std::shared_ptr executor = CreateDefaultExecutor(); + UnionGlobalIndexReader union_reader(std::move(readers), executor); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + CheckResult(result, {10, 20, 30}); + ASSERT_EQ(reader1->InvocationCount(), 1); + ASSERT_EQ(reader2->InvocationCount(), 1); + ASSERT_EQ(reader3->InvocationCount(), 1); +} + +TEST_F(UnionGlobalIndexReaderTest, TestEmptyReaderList) { + std::vector> readers; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + // No readers means no results to merge -> nullptr + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + ASSERT_FALSE(result); +} + +TEST_F(UnionGlobalIndexReaderTest, TestAllReadersReturnNullptr) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetReturnNullptr(); + 
reader2->SetReturnNullptr(); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + // All readers return nullptr -> merged result is nullptr + ASSERT_FALSE(result); +} + +TEST_F(UnionGlobalIndexReaderTest, TestPartialReadersReturnNullptr) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + auto reader3 = std::make_shared(); + reader1->SetReturnNullptr(); + reader2->SetDefaultResult({1, 2}); + reader3->SetReturnNullptr(); + + std::vector> readers = {reader1, reader2, reader3}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + // Nullptrs are skipped, only reader2's result is used + CheckResult(result, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestErrorPropagationSequential) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1, 2}); + reader2->SetReturnError("Unknown error for reader2"); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_NOK_WITH_MSG(union_reader.VisitIsNotNull(), "Unknown error for reader2"); +} + +TEST_F(UnionGlobalIndexReaderTest, TestErrorPropagationWithExecutor) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetReturnError("Unknown error for reader2"); + + std::vector> readers = {reader1, reader2}; + std::shared_ptr executor = CreateDefaultExecutor(); + UnionGlobalIndexReader union_reader(std::move(readers), executor); + + ASSERT_NOK_WITH_MSG(union_reader.VisitIsNotNull(), "Unknown error for reader2"); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitEqualUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + 
reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + Literal literal_42(42); + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitEqual(literal_42)); + CheckResult(result, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitNotEqualUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + Literal literal_42(42); + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitNotEqual(literal_42)); + CheckResult(result, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitRangeQueriesUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + Literal literal_5(5); + ASSERT_OK_AND_ASSIGN(auto lt, union_reader.VisitLessThan(literal_5)); + CheckResult(lt, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto le, union_reader.VisitLessOrEqual(literal_5)); + CheckResult(le, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto gt, union_reader.VisitGreaterThan(literal_5)); + CheckResult(gt, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto ge, union_reader.VisitGreaterOrEqual(literal_5)); + CheckResult(ge, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitInUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1, 3}); + reader2->SetDefaultResult({2, 4}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + std::vector literals = {Literal(10), Literal(20)}; + ASSERT_OK_AND_ASSIGN(auto in_result, union_reader.VisitIn(literals)); + CheckResult(in_result, {1, 2, 3, 4}); + + 
ASSERT_OK_AND_ASSIGN(auto not_in_result, union_reader.VisitNotIn(literals)); + CheckResult(not_in_result, {1, 2, 3, 4}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitStringQueriesUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + Literal literal_str(FieldType::STRING, "abc", 3); + ASSERT_OK_AND_ASSIGN(auto starts, union_reader.VisitStartsWith(literal_str)); + CheckResult(starts, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto ends, union_reader.VisitEndsWith(literal_str)); + CheckResult(ends, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto contains, union_reader.VisitContains(literal_str)); + CheckResult(contains, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto like, union_reader.VisitLike(literal_str)); + CheckResult(like, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitIsNullUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({100, 200}); + reader2->SetDefaultResult({300}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNull()); + CheckResult(result, {100, 200, 300}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitFullTextSearchUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1, 5}); + reader2->SetDefaultResult({2, 6}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitFullTextSearch(nullptr)); + CheckResult(result, {1, 2, 5, 6}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchAllNullptr) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + 
reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitVectorSearch(nullptr)); + ASSERT_FALSE(result); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchErrorPropagation) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetReturnError("vector search failure"); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_NOK_WITH_MSG(union_reader.VisitVectorSearch(nullptr), "vector search failure"); +} + +TEST_F(UnionGlobalIndexReaderTest, TestIsThreadSafeAlwaysFalse) { + auto reader = std::make_shared(); + std::vector> readers = {reader}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + // UnionGlobalIndexReader is not thread-safe regardless of inner readers + ASSERT_FALSE(union_reader.IsThreadSafe()); +} + +TEST_F(UnionGlobalIndexReaderTest, TestGetIndexTypeReturnsUnion) { + std::vector> readers; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_EQ(union_reader.GetIndexType(), "union"); +} + +} // namespace paimon::test diff --git a/src/paimon/common/sst/sst_file_reader.cpp b/src/paimon/common/sst/sst_file_reader.cpp index 8243048b2..6b004c754 100644 --- a/src/paimon/common/sst/sst_file_reader.cpp +++ b/src/paimon/common/sst/sst_file_reader.cpp @@ -154,7 +154,7 @@ Result SstFileReader::DecompressBlock(const MemorySegment& compre crc32c_code = CRC32C::calculate(&compression_val, 1, crc32c_code); if (trailer->Crc32c() != static_cast(crc32c_code)) { return Status::Invalid(fmt::format("Expected crc32c({:#x}) but found crc32c({:#x})", - trailer->Crc32c(), crc32c_code)); + static_cast(trailer->Crc32c()), crc32c_code)); } // decompress data diff --git 
a/src/paimon/common/utils/row_range_index.cpp b/src/paimon/common/utils/row_range_index.cpp index f2eab9177..e78eb2a6b 100644 --- a/src/paimon/common/utils/row_range_index.cpp +++ b/src/paimon/common/utils/row_range_index.cpp @@ -17,7 +17,7 @@ #include "paimon/utils/row_range_index.h" #include -#include +#include namespace paimon { diff --git a/src/paimon/core/append/append_compact_coordinator.cpp b/src/paimon/core/append/append_compact_coordinator.cpp index 174909604..4d36436d1 100644 --- a/src/paimon/core/append/append_compact_coordinator.cpp +++ b/src/paimon/core/append/append_compact_coordinator.cpp @@ -235,8 +235,7 @@ Result>>> Sca const std::vector>& partitions, const std::shared_ptr& executor, const std::shared_ptr& pool) { auto scan_filter = std::make_shared( - /*predicate=*/nullptr, partitions, /*bucket_filter=*/std::nullopt, - /*vector_search=*/nullptr); + /*predicate=*/nullptr, partitions, /*bucket_filter=*/std::nullopt); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr scan, CreateFileStoreScan(snapshot_manager, schema_manager, table_schema, diff --git a/src/paimon/core/global_index/global_index_evaluator.h b/src/paimon/core/global_index/global_index_evaluator.h index 05342d2f6..0a624b1d1 100644 --- a/src/paimon/core/global_index/global_index_evaluator.h +++ b/src/paimon/core/global_index/global_index_evaluator.h @@ -17,10 +17,12 @@ #pragma once #include +#include #include "paimon/global_index/global_index_result.h" #include "paimon/predicate/predicate.h" #include "paimon/predicate/vector_search.h" +#include "paimon/utils/row_range_index.h" #include "paimon/visibility.h" namespace paimon { @@ -30,21 +32,20 @@ class PAIMON_EXPORT GlobalIndexEvaluator { virtual ~GlobalIndexEvaluator() = default; /// Evaluates a predicate against the global index. /// - /// @param predicate The filter predicate to evaluate. - /// @param vector_search The vector similarity search to evaluate. 
- /// @note When both `predicate` and `vector_search` are present, the predicate - /// is used to constrain the vector search space (for example, via a - /// pre-filter callback that may be applied during vector search), so - /// vector similarity scoring is effectively limited to rows that satisfy - /// the predicate. + /// @param predicate The filter predicate to evaluate. + /// @param row_range_index Optional row range that limits evaluation to the given + /// ranges of row ids. Index files whose row range does not + /// intersect with `row_range_index` will be skipped. If a field has + /// no usable index file in the requested range, the evaluator + /// returns `nullptr` for that field. /// @return A `Result` containing: /// - `nullptr` if the predicate cannot be evaluated by this index (e.g., field has - /// no index), + /// no index, or no index file intersects with `row_range_index`), /// - A `std::shared_ptr` if evaluation succeeds. /// The `GlobalIndexResult` indicates the matching rows (e.g., via row ID bitmaps). 
virtual Result> Evaluate( const std::shared_ptr& predicate, - const std::shared_ptr& vector_search) = 0; + const std::optional& row_range_index) = 0; }; } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_evaluator_impl.cpp b/src/paimon/core/global_index/global_index_evaluator_impl.cpp index 12093fba1..a0ffcf8b8 100644 --- a/src/paimon/core/global_index/global_index_evaluator_impl.cpp +++ b/src/paimon/core/global_index/global_index_evaluator_impl.cpp @@ -24,83 +24,52 @@ namespace paimon { Result> GlobalIndexEvaluatorImpl::Evaluate( const std::shared_ptr& predicate, - const std::shared_ptr& vector_search) { + const std::optional& row_range_index) { std::shared_ptr compound_result; if (predicate) { - PAIMON_ASSIGN_OR_RAISE(compound_result, EvaluatePredicate(predicate)); - } - if (vector_search) { - PAIMON_ASSIGN_OR_RAISE( - compound_result, - EvaluateVectorSearch(vector_search, /*predicate_result=*/compound_result)); + ReadersCache cache; + PAIMON_ASSIGN_OR_RAISE(compound_result, + EvaluatePredicate(predicate, row_range_index, cache)); } return compound_result; } Result>> GlobalIndexEvaluatorImpl::GetIndexReaders( - const std::string& field_name) { + const std::string& field_name, const std::optional& row_range_index, + ReadersCache& cache) { PAIMON_ASSIGN_OR_RAISE(DataField data_field, table_schema_->GetField(field_name)); int32_t field_id = data_field.Id(); // get or create global index readers for current field std::vector> readers; - auto iter = index_readers_cache_.find(field_id); - if (iter != index_readers_cache_.end()) { + auto iter = cache.find(field_id); + if (iter != cache.end()) { readers = iter->second; } else { - PAIMON_ASSIGN_OR_RAISE(readers, create_index_readers_(field_id)); - index_readers_cache_.insert({field_id, readers}); + PAIMON_ASSIGN_OR_RAISE(readers, create_index_readers_(field_id, row_range_index)); + cache.insert({field_id, readers}); } return readers; } -Result> GlobalIndexEvaluatorImpl::EvaluateVectorSearch( - 
const std::shared_ptr& vector_search, - const std::shared_ptr& predicate_result) { - PAIMON_ASSIGN_OR_RAISE(std::vector> readers, - GetIndexReaders(vector_search->field_name)); - if (readers.empty()) { - return predicate_result; - } - if (readers.size() > 1) { - return Status::Invalid("Vector search cannot have multiple global indexes"); - } - const auto& vector_search_reader = readers[0]; - if (predicate_result && vector_search->pre_filter != nullptr) { - return Status::Invalid("Predicate result and pre_filter in VectorSearch conflict"); - } - auto final_vector_search = vector_search; - if (predicate_result) { - auto bitmap_global_index_result = - std::dynamic_pointer_cast(predicate_result); - if (!bitmap_global_index_result) { - return Status::Invalid( - "The pre_filter of vector search only supports BitmapGlobalIndexResult"); - } - PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* bitmap, - bitmap_global_index_result->GetBitmap()); - assert(bitmap); - final_vector_search = vector_search->ReplacePreFilter( - [bitmap_global_index_result, bitmap](int64_t row_id) -> bool { - return bitmap->Contains(row_id); - }); - } - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr scored_result, - vector_search_reader->VisitVectorSearch(final_vector_search)); - return scored_result; -} - Result> GlobalIndexEvaluatorImpl::EvaluatePredicate( - const std::shared_ptr& predicate) { + const std::shared_ptr& predicate, + const std::optional& row_range_index, ReadersCache& cache) { if (predicate == nullptr) { return std::shared_ptr(nullptr); } if (auto compound_predicate = std::dynamic_pointer_cast(predicate)) { - return EvaluateCompoundPredicate(compound_predicate); + return EvaluateCompoundPredicate(compound_predicate, row_range_index, cache); } else if (auto leaf_predicate = std::dynamic_pointer_cast(predicate)) { const std::string& field_name = leaf_predicate->FieldName(); PAIMON_ASSIGN_OR_RAISE(std::vector> readers, - GetIndexReaders(field_name)); + GetIndexReaders(field_name, row_range_index, 
cache)); + if (readers.empty()) { + // No usable index for this field within the requested range. Treat as "no + // pushdown available" so the upstream falls back to a full scan instead of + // wrongly producing an empty result. + return std::shared_ptr(nullptr); + } // calculate compound result as field may has multiple indexes std::shared_ptr compound_result; for (const auto& index_reader : readers) { @@ -131,12 +100,13 @@ Result> GlobalIndexEvaluatorImpl::EvaluatePre } Result> GlobalIndexEvaluatorImpl::EvaluateCompoundPredicate( - const std::shared_ptr& compound_predicate) { + const std::shared_ptr& compound_predicate, + const std::optional& row_range_index, ReadersCache& cache) { if (compound_predicate->GetFunction().GetType() == Function::Type::OR) { std::shared_ptr compound_result; for (const auto& child : compound_predicate->Children()) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr sub_result, - EvaluatePredicate(child)); + EvaluatePredicate(child, row_range_index, cache)); if (!sub_result) { return std::shared_ptr(nullptr); } @@ -153,7 +123,7 @@ Result> GlobalIndexEvaluatorImpl::EvaluateCom std::shared_ptr compound_result; for (const auto& child : compound_predicate->Children()) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr sub_result, - EvaluatePredicate(child)); + EvaluatePredicate(child, row_range_index, cache)); if (sub_result) { if (!compound_result) { compound_result = sub_result; diff --git a/src/paimon/core/global_index/global_index_evaluator_impl.h b/src/paimon/core/global_index/global_index_evaluator_impl.h index bf4f1178c..21b14cd8e 100644 --- a/src/paimon/core/global_index/global_index_evaluator_impl.h +++ b/src/paimon/core/global_index/global_index_evaluator_impl.h @@ -27,12 +27,16 @@ #include "paimon/core/schema/table_schema.h" #include "paimon/global_index/global_index_reader.h" #include "paimon/predicate/compound_predicate.h" +#include "paimon/utils/row_range_index.h" namespace paimon { class GlobalIndexEvaluatorImpl : public GlobalIndexEvaluator { 
public: + /// Creates the underlying readers for the given field, optionally restricted to the + /// provided row range. Returns an empty vector when the field has no usable index. using IndexReadersCreator = - std::function>>(int32_t)>; + std::function>>( + int32_t, const std::optional&)>; GlobalIndexEvaluatorImpl(const std::shared_ptr& table_schema, IndexReadersCreator create_index_readers) @@ -40,28 +44,30 @@ class GlobalIndexEvaluatorImpl : public GlobalIndexEvaluator { Result> Evaluate( const std::shared_ptr& predicate, - const std::shared_ptr& vector_search) override; + const std::optional& row_range_index) override; private: - Result> EvaluateVectorSearch( - const std::shared_ptr& vector_search, - const std::shared_ptr& predicate_result); + /// Per-evaluation cache keyed by field id. Reused across recursive calls within a single + /// `Evaluate` invocation so that the same field is not loaded twice; a fresh cache is used + /// for every public `Evaluate` call because `row_range_index` may change between calls. 
+ using ReadersCache = std::map>>; Result> EvaluatePredicate( - const std::shared_ptr& predicate); + const std::shared_ptr& predicate, + const std::optional& row_range_index, ReadersCache& cache); Result> EvaluateCompoundPredicate( - const std::shared_ptr& compound_predicate); + const std::shared_ptr& compound_predicate, + const std::optional& row_range_index, ReadersCache& cache); Result>> GetIndexReaders( - const std::string& field_name); + const std::string& field_name, const std::optional& row_range_index, + ReadersCache& cache); private: std::shared_ptr table_schema_; - // create_index_readers_(field_id) + // create_index_readers_(field_id, row_range_index) IndexReadersCreator create_index_readers_; - // [field_id, vector] - std::map>> index_readers_cache_; }; } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_scan.cpp b/src/paimon/core/global_index/global_index_scan.cpp index 311f06376..309a4b605 100644 --- a/src/paimon/core/global_index/global_index_scan.cpp +++ b/src/paimon/core/global_index/global_index_scan.cpp @@ -20,7 +20,9 @@ #include "paimon/core/global_index/global_index_scan_impl.h" #include "paimon/core/operation/file_store_scan.h" #include "paimon/core/schema/schema_manager.h" +#include "paimon/core/utils/file_store_path_factory.h" #include "paimon/core/utils/snapshot_manager.h" + namespace paimon { namespace { Result> LoadSchema(const std::string& root_path, @@ -87,8 +89,8 @@ Result> GlobalIndexScan::Create( arrow_schema, partitions.value())); } PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, LoadSnapshot(root_path, snapshot_id, core_options)); - return std::make_unique(root_path, table_schema, snapshot, - partition_filters, core_options, pool); + return GlobalIndexScanImpl::Create(root_path, table_schema, snapshot, partition_filters, + core_options, pool); } Result> GlobalIndexScan::Create( @@ -109,8 +111,8 @@ Result> GlobalIndexScan::Create( PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, MergeOptions(table_schema, options, 
file_system)); PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, LoadSnapshot(root_path, snapshot_id, core_options)); - return std::make_unique(root_path, table_schema, snapshot, - partition_filters, core_options, pool); + return GlobalIndexScanImpl::Create(root_path, table_schema, snapshot, partition_filters, + core_options, pool); } } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_scan_impl.cpp b/src/paimon/core/global_index/global_index_scan_impl.cpp index 89851b4ae..1277fc8cb 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.cpp +++ b/src/paimon/core/global_index/global_index_scan_impl.cpp @@ -16,121 +16,64 @@ #include "paimon/core/global_index/global_index_scan_impl.h" #include -#include #include -#include "paimon/common/executor/future.h" -#include "paimon/core/global_index/row_range_global_index_scanner_impl.h" +#include "paimon/common/global_index/offset_global_index_reader.h" +#include "paimon/common/global_index/union_global_index_reader.h" +#include "paimon/core/global_index/global_index_evaluator_impl.h" #include "paimon/core/index/index_file_handler.h" #include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/global_indexer.h" +#include "paimon/global_index/global_indexer_factory.h" + namespace paimon { -GlobalIndexScanImpl::GlobalIndexScanImpl(const std::string& root_path, - const std::shared_ptr& table_schema, - const Snapshot& snapshot, - const std::shared_ptr& partitions, +GlobalIndexScanImpl::GlobalIndexScanImpl(const std::shared_ptr& table_schema, const CoreOptions& options, + const std::shared_ptr& path_factory, + IndexMetaMap&& index_metas, const std::shared_ptr& pool) : pool_(pool), - root_path_(root_path), table_schema_(table_schema), - snapshot_(snapshot), - partitions_(partitions), - options_(options) {} - -Result> GlobalIndexScanImpl::CreateRangeScan( - const Range& range) { - PAIMON_RETURN_NOT_OK(Scan()); - std::optional partition; - // field id -> {index type -> entry} - 
std::map>> filtered_entries; - for (const auto& entry : entries_) { - const auto& global_index_meta = entry.index_file->GetGlobalIndexMeta(); - assert(global_index_meta); - const auto& meta = global_index_meta.value(); - if (Range::HasIntersection(range, Range(meta.row_range_start, meta.row_range_end))) { - if (!partition) { - partition = entry.partition; - } else if (!(partition.value() == entry.partition)) { - return Status::Invalid( - "input range contain multiple partitions, fail to create range scan"); - } - filtered_entries[meta.index_field_id][entry.index_file->IndexType()].push_back(entry); - } - } - std::shared_ptr index_file_path_factory = - path_factory_->CreateGlobalIndexFileFactory(); - return std::make_shared(table_schema_, index_file_path_factory, - filtered_entries, options_, pool_); -} - -Result> GlobalIndexScanImpl::GetRowRangeList() { - PAIMON_RETURN_NOT_OK(Scan()); - std::map> index_type_to_ranges; - std::vector index_ranges; - index_ranges.reserve(entries_.size()); - for (const auto& entry : entries_) { - const auto& global_index_meta = entry.index_file->GetGlobalIndexMeta(); - assert(global_index_meta); - const auto& index_meta = global_index_meta.value(); - Range range(index_meta.row_range_start, index_meta.row_range_end); - index_ranges.push_back(range); - index_type_to_ranges[entry.index_file->IndexType()].push_back(range); - } - std::string check_index_type; - std::vector check_ranges; - // check all type index have same shard ranges - // If index a has [1,10],[20,30] and index b has [1,10],[20,25], it's inconsistent, because - // it is hard to handle the [26,30] range. 
- for (const auto& [type, ranges] : index_type_to_ranges) { - if (check_index_type.empty()) { - check_index_type = type; - check_ranges = Range::SortAndMergeOverlap(ranges, /*adjacent=*/true); - } else { - auto merged = Range::SortAndMergeOverlap(ranges, /*adjacent=*/true); - if (merged != check_ranges) { - return Status::Invalid( - fmt::format("Inconsistent row ranges among index types: {} and {}", - check_index_type, type)); - } - } - } - return Range::SortAndMergeOverlap(index_ranges, /*adjacent=*/false); -} + options_(options), + index_file_manager_( + std::make_shared(options.GetFileSystem(), path_factory)), + index_metas_(std::move(index_metas)) {} -Status GlobalIndexScanImpl::Scan() { - if (initialized_) { - return Status::OK(); - } - auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema_->Fields()); - PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options_.CreateExternalPaths()); +Result> GlobalIndexScanImpl::Create( + const std::string& root_path, const std::shared_ptr& table_schema, + const Snapshot& snapshot, const std::shared_ptr& partitions, + const CoreOptions& options, const std::shared_ptr& pool) { + auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options.CreateExternalPaths()); PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, - options_.CreateGlobalIndexExternalPath()); + options.CreateGlobalIndexExternalPath()); PAIMON_ASSIGN_OR_RAISE( - path_factory_, + std::shared_ptr file_store_path_factory, FileStorePathFactory::Create( - root_path_, arrow_schema, table_schema_->PartitionKeys(), - options_.GetPartitionDefaultName(), options_.GetFileFormat()->Identifier(), - options_.DataFilePrefix(), options_.LegacyPartitionNameEnabled(), external_paths, - global_index_external_path, options_.IndexFileInDataFileDir(), pool_)); + root_path, arrow_schema, table_schema->PartitionKeys(), + options.GetPartitionDefaultName(), 
options.GetFileFormat()->Identifier(), + options.DataFilePrefix(), options.LegacyPartitionNameEnabled(), external_paths, + global_index_external_path, options.IndexFileInDataFileDir(), pool)); + std::shared_ptr path_factory = + file_store_path_factory->CreateGlobalIndexFileFactory(); PAIMON_ASSIGN_OR_RAISE( std::unique_ptr index_manifest_file, - IndexManifestFile::Create(options_.GetFileSystem(), options_.GetManifestFormat(), - options_.GetManifestCompression(), path_factory_, - options_.GetBucket(), pool_, options_)); - auto index_file_handler = - std::make_unique(options_.GetFileSystem(), std::move(index_manifest_file), - std::make_shared(path_factory_), - options_.DeletionVectorsBitmap64(), pool_); + IndexManifestFile::Create(options.GetFileSystem(), options.GetManifestFormat(), + options.GetManifestCompression(), file_store_path_factory, + options.GetBucket(), pool, options)); + auto index_file_handler = std::make_unique( + options.GetFileSystem(), std::move(index_manifest_file), + std::make_shared(file_store_path_factory), + options.DeletionVectorsBitmap64(), pool); PAIMON_ASSIGN_OR_RAISE(std::vector partition_fields, - table_schema_->GetFields(table_schema_->PartitionKeys())); + table_schema->GetFields(table_schema->PartitionKeys())); auto partition_schema = DataField::ConvertDataFieldsToArrowSchema(partition_fields); std::function(const IndexManifestEntry&)> filter = [&](const IndexManifestEntry& entry) -> Result { - if (partitions_) { - PAIMON_ASSIGN_OR_RAISE(bool saved, - partitions_->Test(partition_schema, entry.partition)); + if (partitions) { + PAIMON_ASSIGN_OR_RAISE(bool saved, partitions->Test(partition_schema, entry.partition)); if (!saved) { return false; } @@ -140,76 +83,115 @@ Status GlobalIndexScanImpl::Scan() { } return true; }; - PAIMON_ASSIGN_OR_RAISE(entries_, index_file_handler->Scan(snapshot_, filter)); - initialized_ = true; - return Status::OK(); + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + index_file_handler->Scan(snapshot, filter)); 
+ IndexMetaMap index_metas; + for (const auto& entry : entries) { + auto index_file_meta = entry.index_file; + const auto& index_meta = index_file_meta->GetGlobalIndexMeta(); + assert(index_meta); + Range range(index_meta->row_range_start, index_meta->row_range_end); + index_metas[index_meta->index_field_id][index_file_meta->IndexType()][range].push_back( + index_file_meta); + } + return std::unique_ptr( + new GlobalIndexScanImpl(table_schema, options, path_factory, std::move(index_metas), pool)); } -Result> GlobalIndexScanImpl::ParallelScan( - const std::vector& ranges, const std::shared_ptr& predicate, - const std::shared_ptr& vector_search, const std::shared_ptr& executor) { - std::vector> range_scanners; - range_scanners.reserve(ranges.size()); - for (const auto& range : ranges) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr scanner, - CreateRangeScan(range)); - auto scanner_impl = std::dynamic_pointer_cast(scanner); - if (!scanner_impl) { - return Status::Invalid( - "invalid RowRangeGlobalIndexScanner, fail to cast to " - "RowRangeGlobalIndexScannerImpl"); - } - range_scanners.push_back(scanner_impl); +Result> GlobalIndexScanImpl::GetOrCreateIndexEvaluator() { + if (evaluator_) { + return evaluator_; } + GlobalIndexEvaluatorImpl::IndexReadersCreator create_index_readers = + [this](int32_t field_id, const std::optional& row_range_index) + -> Result>> { + return CreateReaders(field_id, row_range_index); + }; + evaluator_ = std::make_shared(table_schema_, create_index_readers); + return evaluator_; +} - std::vector>>> futures; - for (size_t i = 0; i < range_scanners.size(); i++) { - const auto& scanner = range_scanners[i]; - const auto& range = ranges[i]; - auto search_index = [&scanner, &predicate, &vector_search, - &range]() -> Result> { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr evaluator, - scanner->CreateIndexEvaluator()); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr index_result, - evaluator->Evaluate(predicate, vector_search)); - if (!index_result) { - return 
index_result; - } - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result_with_offset, - index_result->AddOffset(range.from)); - return result_with_offset; - }; - futures.push_back(Via(executor.get(), search_index)); +Result>> GlobalIndexScanImpl::CreateReaders( + int32_t field_id, const std::optional& row_range_index) const { + PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_id)); + return CreateReaders(field, row_range_index); +} + +Result>> GlobalIndexScanImpl::CreateReaders( + const std::string& field_name, const std::optional& row_range_index) const { + PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_name)); + return CreateReaders(field, row_range_index); +} + +Result>> GlobalIndexScanImpl::CreateReaders( + const DataField& field, const std::optional& row_range_index) const { + auto field_iter = index_metas_.find(field.Id()); + if (field_iter == index_metas_.end()) { + return std::vector>(); } - auto collected_results = CollectAll(futures); + const auto& index_type_to_metas = field_iter->second; + std::vector> readers; + readers.reserve(index_type_to_metas.size()); + for (const auto& [index_type, range_to_metas] : index_type_to_metas) { + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr indexer, + GlobalIndexerFactory::Get(index_type, options_.ToMap())); + if (!indexer) { + continue; + } + std::vector> union_readers; + union_readers.reserve(range_to_metas.size()); + for (const auto& [range, metas] : range_to_metas) { + if (row_range_index && !row_range_index->Intersects(range.from, range.to)) { + continue; + } + // TODO(xinyu.lxy): c_arrow_schema may contains additional associated fields. 
+ auto arrow_field = DataField::ConvertDataFieldToArrowField(field); + auto arrow_schema = arrow::schema({arrow_field}); - // collect inner result and check all null - bool all_null = true; - std::vector> results; - for (auto& result : collected_results) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr inner_result, result); - if (inner_result) { - all_null = false; + ArrowSchema c_arrow_schema; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*arrow_schema, &c_arrow_schema)); + auto index_io_metas = ToGlobalIndexIOMetas(metas); + ScopeGuard guard([&]() { ArrowSchemaRelease(&c_arrow_schema); }); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr index_reader, + indexer->CreateReader(&c_arrow_schema, index_file_manager_, index_io_metas, pool_)); + union_readers.push_back( + std::make_shared(std::move(index_reader), range.from)); } - results.push_back(std::move(inner_result)); + if (union_readers.empty()) { + continue; + } + // TODO(lisizhuo.lsz): add executor in UnionGlobalIndexReader + readers.push_back(std::make_shared(std::move(union_readers), + /*executor=*/nullptr)); } - if (all_null) { - return std::shared_ptr(nullptr); + return readers; +} + +std::vector GlobalIndexScanImpl::ToGlobalIndexIOMetas( + const std::vector>& metas) const { + std::vector index_io_metas; + index_io_metas.reserve(metas.size()); + for (const auto& meta : metas) { + index_io_metas.push_back(ToGlobalIndexIOMeta(meta)); } + return index_io_metas; +} - // union result from multiple ranges - std::shared_ptr final_global_index_result; +GlobalIndexIOMeta GlobalIndexScanImpl::ToGlobalIndexIOMeta( + const std::shared_ptr& index_meta) const { + assert(index_meta->GetGlobalIndexMeta()); + const auto& global_index_meta = index_meta->GetGlobalIndexMeta().value(); + return {index_file_manager_->ToPath(index_meta), index_meta->FileSize(), + global_index_meta.index_meta}; +} - for (size_t i = 0; i < results.size(); ++i) { - std::shared_ptr result = - results[i] ? 
results[i] : BitmapGlobalIndexResult::FromRanges({ranges[i]}); - if (!final_global_index_result) { - final_global_index_result = result; - } else { - PAIMON_ASSIGN_OR_RAISE(final_global_index_result, - final_global_index_result->Or(result)); - } - } - return final_global_index_result; +Result> GlobalIndexScanImpl::Scan( + const std::shared_ptr& predicate, + const std::optional& row_range_index) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr evaluator, + GetOrCreateIndexEvaluator()); + return evaluator->Evaluate(predicate, row_range_index); } + } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_scan_impl.h b/src/paimon/core/global_index/global_index_scan_impl.h index 1f7e23cd0..c4b0d5f0c 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.h +++ b/src/paimon/core/global_index/global_index_scan_impl.h @@ -23,48 +23,62 @@ #include "paimon/common/predicate/predicate_filter.h" #include "paimon/core/core_options.h" -#include "paimon/core/manifest/index_manifest_entry.h" +#include "paimon/core/global_index/global_index_evaluator.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_file_meta.h" +#include "paimon/core/index/index_path_factory.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/snapshot.h" -#include "paimon/core/utils/file_store_path_factory.h" -#include "paimon/core/utils/snapshot_manager.h" +#include "paimon/global_index/global_index_io_meta.h" #include "paimon/global_index/global_index_scan.h" namespace paimon { class GlobalIndexScanImpl : public GlobalIndexScan { public: - GlobalIndexScanImpl(const std::string& root_path, - const std::shared_ptr& table_schema, const Snapshot& snapshot, - const std::shared_ptr& partitions, - const CoreOptions& options, const std::shared_ptr& pool); + static Result> Create( + const std::string& root_path, const std::shared_ptr& table_schema, + const Snapshot& snapshot, const std::shared_ptr& partitions, + const 
CoreOptions& options, const std::shared_ptr& pool); - Result> CreateRangeScan( - const Range& range) override; + Result> Scan( + const std::shared_ptr& predicate, + const std::optional& row_range_index); - Result> GetRowRangeList() override; + Result>> CreateReaders( + int32_t field_id, const std::optional& row_range_index) const; - const Snapshot& GetSnapshot() const { - return snapshot_; - } - - Result> ParallelScan( - const std::vector& ranges, const std::shared_ptr& predicate, - const std::shared_ptr& vector_search, - const std::shared_ptr& executor); + Result>> CreateReaders( + const std::string& field_name, const std::optional& row_range_index) const; private: - Status Scan(); + /// (id->index_type->row_range) -> index meta list + using IndexMetaMap = + std::map>>>>; + + GlobalIndexScanImpl(const std::shared_ptr& table_schema, + const CoreOptions& options, + const std::shared_ptr& path_factory, + IndexMetaMap&& index_metas, const std::shared_ptr& pool); + + Result> GetOrCreateIndexEvaluator(); + + Result>> CreateReaders( + const DataField& field, const std::optional& row_range_index) const; + + std::vector ToGlobalIndexIOMetas( + const std::vector>& metas) const; + + GlobalIndexIOMeta ToGlobalIndexIOMeta(const std::shared_ptr& index_meta) const; private: - bool initialized_ = false; std::shared_ptr pool_; std::string root_path_; std::shared_ptr table_schema_; - Snapshot snapshot_; - std::shared_ptr partitions_; CoreOptions options_; - std::shared_ptr path_factory_; - std::vector entries_; + std::shared_ptr index_file_manager_; + IndexMetaMap index_metas_; + std::shared_ptr evaluator_; }; } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_write_task.cpp b/src/paimon/core/global_index/global_index_write_task.cpp index cd0e3c6e7..5ee425f86 100644 --- a/src/paimon/core/global_index/global_index_write_task.cpp +++ b/src/paimon/core/global_index/global_index_write_task.cpp @@ -123,7 +123,12 @@ Result> BuildIndex(const std::string& 
field_name, std::vector relative_row_ids; relative_row_ids.reserve(typed_row_id_array->length()); for (int64_t i = 0; i < typed_row_id_array->length(); i++) { - relative_row_ids.push_back(typed_row_id_array->Value(i) - range.from); + int64_t row_id = typed_row_id_array->Value(i); + if (row_id < range.from || row_id > range.to) { + return Status::Invalid("invalid row id ", row_id, ", out of range [", range.from, + ", ", range.to, "]"); + } + relative_row_ids.push_back(row_id - range.from); } PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr new_array, arrow::StructArray::Make({indexed_array}, {field_name})); diff --git a/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp b/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp deleted file mode 100644 index 846a3bd53..000000000 --- a/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -#include "paimon/core/global_index/row_range_global_index_scanner_impl.h" - -#include -#include -#include -#include - -#include "arrow/c/bridge.h" -#include "arrow/c/helpers.h" -#include "paimon/common/utils/scope_guard.h" -#include "paimon/core/global_index/global_index_evaluator_impl.h" -#include "paimon/global_index/global_indexer.h" -#include "paimon/global_index/global_indexer_factory.h" -namespace paimon { -RowRangeGlobalIndexScannerImpl::RowRangeGlobalIndexScannerImpl( - const std::shared_ptr& table_schema, - const std::shared_ptr& path_factory, - const RowRangeGlobalIndexScannerImpl::IndexManifestEntryGroup& grouped_entries, - const CoreOptions& options, const std::shared_ptr& pool) - : pool_(pool), - table_schema_(table_schema), - options_(options), - grouped_entries_(grouped_entries), - index_file_manager_( - std::make_shared(options.GetFileSystem(), path_factory)) {} - -Result> RowRangeGlobalIndexScannerImpl::CreateIndexEvaluator() - const { - GlobalIndexEvaluatorImpl::IndexReadersCreator create_index_readers = - [scanner = shared_from_this()]( - int32_t field_id) -> Result>> { - return scanner->CreateReaders(field_id); - }; - return std::make_shared(table_schema_, create_index_readers); -} - -Result> RowRangeGlobalIndexScannerImpl::CreateReader( - const std::string& field_name, const std::string& index_type) const { - PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_name)); - auto field_iter = grouped_entries_.find(field.Id()); - if (field_iter == grouped_entries_.end()) { - return std::shared_ptr(); - } - const auto& index_type_to_entries = field_iter->second; - auto entry_iter = index_type_to_entries.find(index_type); - if (entry_iter == index_type_to_entries.end()) { - return std::shared_ptr(); - } - const auto& entries = entry_iter->second; - return CreateReader(field, index_type, entries); -} - -Result>> -RowRangeGlobalIndexScannerImpl::CreateReaders(const std::string& field_name) const { - 
PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_name)); - return CreateReaders(field); -} - -Result>> -RowRangeGlobalIndexScannerImpl::CreateReaders(int32_t field_id) const { - PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_id)); - return CreateReaders(field); -} - -Result>> -RowRangeGlobalIndexScannerImpl::CreateReaders(const DataField& field) const { - auto field_iter = grouped_entries_.find(field.Id()); - if (field_iter == grouped_entries_.end()) { - return std::vector>(); - } - const auto& index_type_to_entries = field_iter->second; - std::vector> readers; - readers.reserve(index_type_to_entries.size()); - for (const auto& [index_type, entries] : index_type_to_entries) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, - CreateReader(field, index_type, entries)); - if (reader) { - readers.push_back(std::move(reader)); - } - } - return readers; -} - -Result> RowRangeGlobalIndexScannerImpl::CreateReader( - const DataField& field, const std::string& index_type, - const std::vector& entries) const { - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr indexer, - GlobalIndexerFactory::Get(index_type, options_.ToMap())); - if (!indexer) { - return std::shared_ptr(); - } - // TODO(xinyu.lxy): c_arrow_schema may contains additional associated fields. 
- auto arrow_field = DataField::ConvertDataFieldToArrowField(field); - auto arrow_schema = arrow::schema({arrow_field}); - - ArrowSchema c_arrow_schema; - PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*arrow_schema, &c_arrow_schema)); - auto index_io_metas = ToGlobalIndexIOMetas(entries); - ScopeGuard guard([&]() { ArrowSchemaRelease(&c_arrow_schema); }); - return indexer->CreateReader(&c_arrow_schema, index_file_manager_, index_io_metas, pool_); -} - -std::vector RowRangeGlobalIndexScannerImpl::ToGlobalIndexIOMetas( - const std::vector& entries) const { - std::vector index_io_metas; - index_io_metas.reserve(entries.size()); - for (const auto& entry : entries) { - index_io_metas.push_back(ToGlobalIndexIOMeta(entry)); - } - return index_io_metas; -} - -GlobalIndexIOMeta RowRangeGlobalIndexScannerImpl::ToGlobalIndexIOMeta( - const IndexManifestEntry& entry) const { - const auto& index_file = entry.index_file; - assert(index_file->GetGlobalIndexMeta()); - const auto& global_index_meta = index_file->GetGlobalIndexMeta().value(); - return {index_file_manager_->ToPath(index_file), index_file->FileSize(), - global_index_meta.index_meta}; -} - -} // namespace paimon diff --git a/src/paimon/core/global_index/row_range_global_index_scanner_impl.h b/src/paimon/core/global_index/row_range_global_index_scanner_impl.h deleted file mode 100644 index 41b26ea05..000000000 --- a/src/paimon/core/global_index/row_range_global_index_scanner_impl.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include "paimon/core/core_options.h" -#include "paimon/core/global_index/global_index_evaluator.h" -#include "paimon/core/global_index/global_index_file_manager.h" -#include "paimon/core/manifest/index_manifest_entry.h" -#include "paimon/core/schema/table_schema.h" -#include "paimon/global_index/global_index_io_meta.h" -#include "paimon/global_index/row_range_global_index_scanner.h" -namespace paimon { -class RowRangeGlobalIndexScannerImpl - : public RowRangeGlobalIndexScanner, - public std::enable_shared_from_this { - public: - using IndexManifestEntryGroup = - std::map>>; - - RowRangeGlobalIndexScannerImpl(const std::shared_ptr& table_schema, - const std::shared_ptr& path_factory, - const IndexManifestEntryGroup& grouped_entries, - const CoreOptions& options, - const std::shared_ptr& pool); - - Result> CreateIndexEvaluator() const; - - /// @return nullptr if global index reader not exist or plugin mismatch - Result> CreateReader( - const std::string& field_name, const std::string& index_type) const override; - - Result>> CreateReaders( - const std::string& field_name) const override; - - private: - Result>> CreateReaders(int32_t field_id) const; - Result>> CreateReaders( - const DataField& field) const; - - Result> CreateReader( - const DataField& field, const std::string& index_type, - const std::vector& entries) const; - - std::vector ToGlobalIndexIOMetas( - const std::vector& entries) const; - - GlobalIndexIOMeta ToGlobalIndexIOMeta(const IndexManifestEntry& entry) const; - - private: - std::shared_ptr pool_; - std::shared_ptr table_schema_; - CoreOptions options_; - IndexManifestEntryGroup grouped_entries_; - std::shared_ptr index_file_manager_; -}; - -} // namespace paimon diff --git a/src/paimon/core/operation/abstract_file_store_write.cpp 
b/src/paimon/core/operation/abstract_file_store_write.cpp index 2ef4c8215..712dbfe75 100644 --- a/src/paimon/core/operation/abstract_file_store_write.cpp +++ b/src/paimon/core/operation/abstract_file_store_write.cpp @@ -286,8 +286,7 @@ Result> AbstractFileStoreWrite::ScanExistingFileMe partition_filters.push_back(part_values_map); } auto scan_filter = std::make_shared( - /*predicate=*/nullptr, partition_filters, std::optional(bucket), - /*vector_search=*/nullptr); + /*predicate=*/nullptr, partition_filters, std::optional(bucket)); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr scan, CreateFileStoreScan(scan_filter)); std::shared_ptr index_file_handler; diff --git a/src/paimon/core/operation/data_evolution_file_store_scan.cpp b/src/paimon/core/operation/data_evolution_file_store_scan.cpp index 07ded98b2..e9bb68a68 100644 --- a/src/paimon/core/operation/data_evolution_file_store_scan.cpp +++ b/src/paimon/core/operation/data_evolution_file_store_scan.cpp @@ -26,9 +26,9 @@ #include "paimon/common/utils/range_helper.h" namespace paimon { Result DataEvolutionFileStoreScan::FilterEntryByRowRanges( - const ManifestEntry& entry, const std::optional>& row_ranges) { - // If row ranges is null, all entries should be kept - if (!row_ranges) { + const ManifestEntry& entry, const std::optional& row_range_index) { + // If row range index is null, all entries should be kept + if (!row_range_index) { return true; } // If firstRowId does not exist, keep the entry @@ -39,34 +39,12 @@ Result DataEvolutionFileStoreScan::FilterEntryByRowRanges( // Check if any value in indices is in the range [firstRowId, firstRowId + rowCount - 1] int64_t end_row_id = first_row_id.value() + entry.File()->row_count - 1; - Range file_range(first_row_id.value(), end_row_id); - for (const auto& row_range : row_ranges.value()) { - if (Range::HasIntersection(file_range, row_range)) { - return true; - } - } - // No matching indices found, skip this entry - return false; + return 
row_range_index->Intersects(first_row_id.value(), end_row_id); } Result DataEvolutionFileStoreScan::FilterByStats(const ManifestEntry& entry) const { - return FilterEntryByRowRanges(entry, row_ranges_); -} - -std::vector DataEvolutionFileStoreScan::PostFilterManifests( - std::vector&& manifests) const { - if (!row_ranges_) { - return std::move(manifests); - } - std::vector result_metas; - result_metas.reserve(manifests.size()); - for (auto& manifest : manifests) { - if (FilterManifestByRowRanges(manifest, row_ranges_)) { - result_metas.push_back(std::move(manifest)); - } - } - return result_metas; + return FilterEntryByRowRanges(entry, row_range_index_); } Result> DataEvolutionFileStoreScan::PostFilterManifestEntries( @@ -101,26 +79,6 @@ Result> DataEvolutionFileStoreScan::PostFilterManifes return result_entries; } -bool DataEvolutionFileStoreScan::FilterManifestByRowRanges( - const ManifestFileMeta& manifest, const std::optional>& row_ranges) { - if (!row_ranges) { - return true; - } - std::optional min = manifest.MinRowId(); - std::optional max = manifest.MaxRowId(); - if (!min || !max) { - return true; - } - - Range manifest_range(min.value(), max.value()); - for (const auto& range : row_ranges.value()) { - if (Range::HasIntersection(manifest_range, range)) { - return true; - } - } - return false; -} - Result DataEvolutionFileStoreScan::FilterByStatsWithSameRowId( const std::vector& entries) const { if (entries.empty()) { diff --git a/src/paimon/core/operation/data_evolution_file_store_scan.h b/src/paimon/core/operation/data_evolution_file_store_scan.h index 7c9fc6df9..9dbd2cddd 100644 --- a/src/paimon/core/operation/data_evolution_file_store_scan.h +++ b/src/paimon/core/operation/data_evolution_file_store_scan.h @@ -64,9 +64,6 @@ class DataEvolutionFileStoreScan : public FileStoreScan { return scan; } - std::vector PostFilterManifests( - std::vector&& manifests) const override; - Result> PostFilterManifestEntries( std::vector&& entries) const override; @@ 
-88,10 +85,8 @@ class DataEvolutionFileStoreScan : public FileStoreScan { Result FilterByStatsWithSameRowId(const std::vector& entries) const; - static bool FilterManifestByRowRanges(const ManifestFileMeta& manifest, - const std::optional>& row_ranges); static Result FilterEntryByRowRanges(const ManifestEntry& entry, - const std::optional>& row_ranges); + const std::optional& row_range_index); static Result> EvolutionStats( const std::vector& entries, const std::shared_ptr& table_schema, diff --git a/src/paimon/core/operation/data_evolution_file_store_scan_test.cpp b/src/paimon/core/operation/data_evolution_file_store_scan_test.cpp index 12e6681ab..49b8e8903 100644 --- a/src/paimon/core/operation/data_evolution_file_store_scan_test.cpp +++ b/src/paimon/core/operation/data_evolution_file_store_scan_test.cpp @@ -558,15 +558,9 @@ TEST_F(DataEvolutionFileStoreScanTest, TestFilterEntryByRowRanges) { { // row_ids is null ASSERT_OK_AND_ASSIGN(bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::nullopt)); + entry, /*row_range_index=*/std::nullopt)); ASSERT_TRUE(exist); } - { - // row_ids is empty - ASSERT_OK_AND_ASSIGN(bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::vector())); - ASSERT_FALSE(exist); - } { auto file_without_first_row_id = std::make_shared( "data-0.orc", /*file_size=*/645, @@ -582,59 +576,37 @@ TEST_F(DataEvolutionFileStoreScanTest, TestFilterEntryByRowRanges) { ManifestEntry entry_without_first_row_id(FileKind::Add(), BinaryRow::EmptyRow(), /*bucket=*/0, /*total_buckets=*/1, file_without_first_row_id); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(0l, 0l)}))); // first row id is null - ASSERT_OK_AND_ASSIGN( - bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry_without_first_row_id, - /*row_ranges=*/std::optional>({Range(0l, 0l)}))); + ASSERT_OK_AND_ASSIGN(bool exist, 
DataEvolutionFileStoreScan::FilterEntryByRowRanges( + entry_without_first_row_id, row_range_index)); ASSERT_TRUE(exist); } { - ASSERT_OK_AND_ASSIGN(bool exist, - DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::optional>( - {Range(0l, 0l), Range(10l, 10l)}))); + ASSERT_OK_AND_ASSIGN( + RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(0l, 0l), Range(10l, 10l)}))); + ASSERT_OK_AND_ASSIGN( + bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges(entry, row_range_index)); ASSERT_FALSE(exist); } { - ASSERT_OK_AND_ASSIGN(bool exist, - DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::optional>( - {Range(0l, 0l), Range(101l, 101l)}))); + ASSERT_OK_AND_ASSIGN( + RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(0l, 0l), Range(101l, 101l)}))); + ASSERT_OK_AND_ASSIGN( + bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges(entry, row_range_index)); ASSERT_TRUE(exist); } { - ASSERT_OK_AND_ASSIGN(bool exist, - DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::optional>( - {Range(100l, 100l), Range(189l, 189l)}))); + ASSERT_OK_AND_ASSIGN( + RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(100l, 100l), Range(189l, 189l)}))); + ASSERT_OK_AND_ASSIGN( + bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges(entry, row_range_index)); ASSERT_TRUE(exist); } } -TEST_F(DataEvolutionFileStoreScanTest, TestFilterManifestByRowRanges) { - // row id [10, 20] - auto manifest1 = - ManifestFileMeta("manifest-65b0d403-a1bc-4157-b242-bff73c46596d-0", /*file_size=*/2779, - /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), - /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, - /*min_level=*/0, /*max_level=*/0, - /*min_row_id=*/10, /*max_row_id=*/20); - ASSERT_TRUE(DataEvolutionFileStoreScan::FilterManifestByRowRanges(manifest1, std::nullopt)); - ASSERT_FALSE( - 
DataEvolutionFileStoreScan::FilterManifestByRowRanges(manifest1, std::vector())); - ASSERT_TRUE(DataEvolutionFileStoreScan::FilterManifestByRowRanges( - manifest1, std::optional>({Range(0, 15), Range(100, 200)}))); - ASSERT_FALSE(DataEvolutionFileStoreScan::FilterManifestByRowRanges( - manifest1, std::optional>({Range(0, 5), Range(100, 200)}))); - - auto manifest2 = - ManifestFileMeta("manifest-65b0d403-a1bc-4157-b242-bff73c46596d-0", /*file_size=*/2779, - /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), - /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, - /*min_level=*/0, /*max_level=*/0, - /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); - ASSERT_TRUE(DataEvolutionFileStoreScan::FilterManifestByRowRanges( - manifest2, std::optional>({Range(0, 0)}))); -} } // namespace paimon::test diff --git a/src/paimon/core/operation/file_store_commit_impl.cpp b/src/paimon/core/operation/file_store_commit_impl.cpp index 87bcfbcd0..c0e545c34 100644 --- a/src/paimon/core/operation/file_store_commit_impl.cpp +++ b/src/paimon/core/operation/file_store_commit_impl.cpp @@ -313,9 +313,8 @@ Result FileStoreCommitImpl::GetLastCommitTableRequest() { Result> FileStoreCommitImpl::GetAllFiles( const Snapshot& snapshot, const std::vector>& partitions) { - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, partitions, - /*bucket_filter=*/std::nullopt, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, partitions, + /*bucket_filter=*/std::nullopt); PAIMON_ASSIGN_OR_RAISE( auto scan, AppendOnlyFileStoreScan::Create( snapshot_manager_, schema_manager_, manifest_list_, manifest_file_, @@ -519,9 +518,8 @@ Result> FileStoreCommitImpl::ReadAllEntriesFromChange const std::set>& partitions) const { std::vector> partition_filters(partitions.begin(), partitions.end()); - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, partition_filters, - /*bucket_filter=*/std::nullopt, /*vector_search=*/nullptr); + auto 
scan_filter = std::make_shared(/*predicate=*/nullptr, partition_filters, + /*bucket_filter=*/std::nullopt); PAIMON_ASSIGN_OR_RAISE( auto scan, AppendOnlyFileStoreScan::Create( snapshot_manager_, schema_manager_, manifest_list_, manifest_file_, diff --git a/src/paimon/core/operation/file_store_scan.cpp b/src/paimon/core/operation/file_store_scan.cpp index 78d639a83..925a3afce 100644 --- a/src/paimon/core/operation/file_store_scan.cpp +++ b/src/paimon/core/operation/file_store_scan.cpp @@ -130,7 +130,6 @@ Result> FileStoreScan::CreatePlan() cons std::vector filtered_manifest_file_metas; PAIMON_RETURN_NOT_OK( ReadManifests(&snapshot, &all_manifest_file_metas, &filtered_manifest_file_metas)); - filtered_manifest_file_metas = PostFilterManifests(std::move(filtered_manifest_file_metas)); std::vector manifest_entries; PAIMON_RETURN_NOT_OK(ReadManifestEntries(filtered_manifest_file_metas, &manifest_entries)); @@ -288,14 +287,31 @@ Result FileStoreScan::FilterManifestFileMeta(const ManifestFileMeta& manif } } // filter by partition filter - if (!partition_filter_) { + + if (partition_filter_) { + SimpleStats stats = manifest.PartitionStats(); + PAIMON_ASSIGN_OR_RAISE( + bool saved, partition_filter_->Test( + partition_schema_, + /*row_count=*/manifest.NumAddedFiles() + manifest.NumDeletedFiles(), + stats.MinValues(), stats.MaxValues(), stats.NullCounts())); + if (!saved) { + return false; + } + } + return FilterManifestByRowRanges(manifest); +} + +bool FileStoreScan::FilterManifestByRowRanges(const ManifestFileMeta& manifest) const { + if (!row_range_index_) { + return true; + } + std::optional min = manifest.MinRowId(); + std::optional max = manifest.MaxRowId(); + if (!min || !max) { return true; } - SimpleStats stats = manifest.PartitionStats(); - return partition_filter_->Test( - partition_schema_, - /*row_count=*/manifest.NumAddedFiles() + manifest.NumDeletedFiles(), stats.MinValues(), - stats.MaxValues(), stats.NullCounts()); + return 
row_range_index_->Intersects(min.value(), max.value()); } Status FileStoreScan::ReadManifestFileMeta(const ManifestFileMeta& manifest, diff --git a/src/paimon/core/operation/file_store_scan.h b/src/paimon/core/operation/file_store_scan.h index e55f07620..46a06e4a7 100644 --- a/src/paimon/core/operation/file_store_scan.h +++ b/src/paimon/core/operation/file_store_scan.h @@ -50,6 +50,7 @@ #include "paimon/result.h" #include "paimon/scan_context.h" #include "paimon/status.h" +#include "paimon/utils/row_range_index.h" namespace arrow { class Schema; @@ -114,8 +115,8 @@ class FileStoreScan { return this; } - FileStoreScan* WithRowRanges(const std::vector& row_ranges) { - row_ranges_ = row_ranges; + FileStoreScan* WithRowRangeIndex(const RowRangeIndex& row_range_index) { + row_range_index_ = row_range_index; return this; } @@ -194,11 +195,6 @@ class FileStoreScan { /// @note Keep this thread-safe. virtual Result FilterByStats(const ManifestEntry& entry) const = 0; - virtual std::vector PostFilterManifests( - std::vector&& manifests) const { - return std::move(manifests); - } - virtual Result> PostFilterManifestEntries( std::vector&& entries) const { return std::move(entries); @@ -252,6 +248,8 @@ class FileStoreScan { Result FilterManifestFileMeta(const ManifestFileMeta& manifest) const; + bool FilterManifestByRowRanges(const ManifestFileMeta& manifest) const; + Status ReadManifestFileMeta(const ManifestFileMeta& manifest, std::vector* entries) const; @@ -261,7 +259,7 @@ class FileStoreScan { std::shared_ptr predicates_; std::shared_ptr schema_; std::shared_ptr table_schema_; - std::optional> row_ranges_; + std::optional row_range_index_; ScanMode scan_mode_ = ScanMode::ALL; CoreOptions core_options_; diff --git a/src/paimon/core/operation/key_value_file_store_scan_test.cpp b/src/paimon/core/operation/key_value_file_store_scan_test.cpp index 569558ee6..35bbfe078 100644 --- a/src/paimon/core/operation/key_value_file_store_scan_test.cpp +++ 
b/src/paimon/core/operation/key_value_file_store_scan_test.cpp @@ -138,10 +138,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { std::string table_path = paimon::test::GetDataDir() + "orc/pk_table_with_dv_cardinality.db/pk_table_with_dv_cardinality"; std::vector> partition_filters = {{{"f1", "10"}}}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); @@ -182,10 +181,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { "orc/pk_table_with_dv_cardinality.db/" "pk_table_with_dv_cardinality"; std::vector> partition_filters = {{{"f1", "10"}}}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/1, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/1); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/4)); @@ -200,10 +198,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { paimon::test::GetDataDir() + "orc/pk_table_with_mor.db/pk_table_with_mor"; std::vector> partition_filters = { {{"p0", "1"}, {"p1", "0"}}}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/1)); @@ -218,10 
+215,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { paimon::test::GetDataDir() + "orc/pk_table_with_mor.db/pk_table_with_mor"; std::vector> partition_filters = { {{"p0", "0"}, {"p1", "0"}}}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); @@ -235,10 +231,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { std::string table_path = paimon::test::GetDataDir() + "orc/pk_table_partial_update.db/pk_table_partial_update"; std::vector> partition_filters = {}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); @@ -255,7 +250,7 @@ TEST_F(KeyValueFileStoreScanTest, TestScanDurationMetric) { std::vector> partition_filters = {{{"f1", "10"}}}; auto scan_filter = std::make_shared(/*predicate=*/nullptr, /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); @@ -307,7 +302,7 @@ TEST_F(KeyValueFileStoreScanTest, TestSplitAndSetKeyValueFilter) { PredicateBuilder::And({not_equal, equal, greater_than, less_than})); auto scan_filter = std::make_shared(/*predicate=*/predicate, /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, 
/*vector_search=*/nullptr); + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/1)); @@ -377,8 +372,7 @@ TEST_F(KeyValueFileStoreScanTest, TestFilterByValueFilterWithValueStatsCols) { FieldType::DOUBLE, Literal(30.1)); auto scan_filter = std::make_shared(/*predicate=*/greater_than, /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, - /*vector_search=*/nullptr); + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/1)); diff --git a/src/paimon/core/operation/scan_context.cpp b/src/paimon/core/operation/scan_context.cpp index 7c6cd2cfb..ec60dfa4f 100644 --- a/src/paimon/core/operation/scan_context.cpp +++ b/src/paimon/core/operation/scan_context.cpp @@ -56,7 +56,6 @@ class ScanContextBuilder::Impl { bucket_filter_ = std::nullopt; partition_filters_.clear(); predicates_.reset(); - vector_search_.reset(); global_index_result_.reset(); memory_pool_ = GetDefaultPool(); executor_ = CreateDefaultExecutor(); @@ -71,7 +70,6 @@ class ScanContextBuilder::Impl { std::optional bucket_filter_; std::vector> partition_filters_; std::shared_ptr predicates_; - std::shared_ptr vector_search_; std::shared_ptr global_index_result_; std::shared_ptr memory_pool_ = GetDefaultPool(); std::shared_ptr executor_ = CreateDefaultExecutor(); @@ -110,12 +108,6 @@ ScanContextBuilder& ScanContextBuilder::SetPredicate(const std::shared_ptr& vector_search) { - impl_->vector_search_ = vector_search; - return *this; -} - ScanContextBuilder& ScanContextBuilder::SetGlobalIndexResult( const std::shared_ptr& global_index_result) { impl_->global_index_result_ = global_index_result; @@ -159,7 +151,7 @@ Result> ScanContextBuilder::Finish() { auto ctx = std::make_unique( impl_->path_, impl_->is_streaming_mode_, impl_->limit_, std::make_shared(impl_->predicates_, impl_->partition_filters_, - 
impl_->bucket_filter_, impl_->vector_search_), + impl_->bucket_filter_), impl_->global_index_result_, impl_->memory_pool_, impl_->executor_, impl_->specific_file_system_, impl_->options_); impl_->Reset(); diff --git a/src/paimon/core/operation/scan_context_test.cpp b/src/paimon/core/operation/scan_context_test.cpp index 89b2e2ac5..ee0f77153 100644 --- a/src/paimon/core/operation/scan_context_test.cpp +++ b/src/paimon/core/operation/scan_context_test.cpp @@ -36,7 +36,6 @@ TEST(ScanContextTest, TestSimple) { ASSERT_TRUE(ctx->GetScanFilters()); ASSERT_FALSE(ctx->GetScanFilters()->GetBucketFilter()); ASSERT_FALSE(ctx->GetScanFilters()->GetPredicate()); - ASSERT_FALSE(ctx->GetScanFilters()->GetVectorSearch()); ASSERT_TRUE(ctx->GetScanFilters()->GetPartitionFilters().empty()); ASSERT_FALSE(ctx->GetGlobalIndexResult()); ASSERT_FALSE(ctx->GetSpecificFileSystem()); @@ -50,11 +49,6 @@ TEST(ScanContextTest, TestSetFilter) { auto predicate = PredicateBuilder::IsNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); builder.SetPredicate(predicate); - std::vector query = {1.0, 2.0}; - VectorSearch::PreFilter pre_filter = [](int64_t id) -> bool { return id % 2; }; - builder.SetVectorSearch(std::make_shared( - "f0", 10, query, pre_filter, nullptr, VectorSearch::DistanceType::INNER_PRODUCT, - std::map())); std::vector row_ranges = {Range(1, 2), Range(4, 5)}; auto global_index_result = BitmapGlobalIndexResult::FromRanges(row_ranges); builder.SetGlobalIndexResult(global_index_result); @@ -70,11 +64,6 @@ TEST(ScanContextTest, TestSetFilter) { ASSERT_TRUE(ctx->GetScanFilters()); ASSERT_EQ(10, ctx->GetScanFilters()->GetBucketFilter()); ASSERT_EQ(*predicate, *(ctx->GetScanFilters()->GetPredicate())); - auto result_vector_search = ctx->GetScanFilters()->GetVectorSearch(); - ASSERT_TRUE(result_vector_search); - ASSERT_EQ(query, result_vector_search->query); - ASSERT_EQ(VectorSearch::DistanceType::INNER_PRODUCT, - result_vector_search->distance_type.value()); 
ASSERT_EQ(partition_filters, ctx->GetScanFilters()->GetPartitionFilters()); ASSERT_EQ("{1,2,4,5}", ctx->GetGlobalIndexResult()->ToString()); std::map expected_options = {{"key", "value"}}; diff --git a/src/paimon/core/table/source/data_evolution_batch_scan.cpp b/src/paimon/core/table/source/data_evolution_batch_scan.cpp index 60e6b2084..7c617f46a 100644 --- a/src/paimon/core/table/source/data_evolution_batch_scan.cpp +++ b/src/paimon/core/table/source/data_evolution_batch_scan.cpp @@ -26,15 +26,13 @@ namespace paimon { DataEvolutionBatchScan::DataEvolutionBatchScan( const std::string& table_path, const std::shared_ptr& snapshot_reader, std::unique_ptr&& batch_scan, - const std::shared_ptr& global_index_result, - const std::shared_ptr& vector_search, const CoreOptions& core_options, + const std::shared_ptr& global_index_result, const CoreOptions& core_options, const std::shared_ptr& pool, const std::shared_ptr& executor) : AbstractTableScan(core_options, snapshot_reader), pool_(pool), table_path_(table_path), batch_scan_(std::move(batch_scan)), global_index_result_(global_index_result), - vector_search_(vector_search), executor_(executor) {} Result> DataEvolutionBatchScan::CreatePlan() { @@ -52,7 +50,12 @@ Result> DataEvolutionBatchScan::CreatePlan() { if (!row_ranges) { return batch_scan_->CreatePlan(); } - batch_scan_->WithRowRanges(row_ranges.value()); + if (row_ranges.value().empty()) { + return PlanImpl::EmptyPlan(); + } + PAIMON_ASSIGN_OR_RAISE(RowRangeIndex row_range_index, + RowRangeIndex::Create(row_ranges.value())); + batch_scan_->WithRowRangeIndex(row_range_index); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_plan, batch_scan_->CreatePlan()); std::map id_to_score; if (auto scored_result = @@ -64,14 +67,13 @@ Result> DataEvolutionBatchScan::CreatePlan() { id_to_score[id] = score; } } - return WrapToIndexedSplits(data_plan, row_ranges.value(), id_to_score); + return WrapToIndexedSplits(data_plan, row_range_index, id_to_score); } Result> 
DataEvolutionBatchScan::WrapToIndexedSplits( - const std::shared_ptr& data_plan, const std::vector& row_ranges, + const std::shared_ptr& data_plan, const RowRangeIndex& row_range_index, const std::map& id_to_score) const { - std::vector sorted_row_ranges = - Range::SortAndMergeOverlap(row_ranges, /*adjacent=*/true); + // TODO(lisizhuo.lsz): add executor here auto data_splits = data_plan->Splits(); std::vector> indexed_splits; indexed_splits.reserve(data_splits.size()); @@ -80,14 +82,22 @@ Result> DataEvolutionBatchScan::WrapToIndexedSplits( if (!data_split) { return Status::Invalid("Cannot cast split to DataSplit when create IndexedSplit"); } - std::vector file_ranges; - file_ranges.reserve(data_split->DataFiles().size()); - for (const auto& meta : data_split->DataFiles()) { - PAIMON_ASSIGN_OR_RAISE(int64_t first_row_id, meta->NonNullFirstRowId()); - file_ranges.emplace_back(first_row_id, first_row_id + meta->row_count - 1); + const auto& files = data_split->DataFiles(); + if (files.empty()) { + return Status::Invalid("Empty data files in WrapToIndexedSplits"); + } + PAIMON_ASSIGN_OR_RAISE(int64_t min, files[0]->NonNullFirstRowId()); + PAIMON_ASSIGN_OR_RAISE(int64_t max, files[files.size() - 1]->NonNullFirstRowId()); + max += files[files.size() - 1]->row_count - 1; + + std::vector expected = row_range_index.IntersectedRanges(min, max); + if (expected.empty()) { + return Status::Invalid( + fmt::format("There should be intersected ranges for split with min row id {} and " + "max row id {}.", + min, max)); } - auto sorted_file_ranges = Range::SortAndMergeOverlap(file_ranges, /*adjacent=*/true); - std::vector expected = Range::And(sorted_file_ranges, sorted_row_ranges); + std::vector scores; if (!id_to_score.empty()) { for (const auto& range : expected) { @@ -108,7 +118,7 @@ Result> DataEvolutionBatchScan::WrapToIndexedSplits( Result> DataEvolutionBatchScan::EvalGlobalIndex() const { auto predicate = batch_scan_->GetNonPartitionPredicate(); - if (!predicate && 
!vector_search_) { + if (!predicate) { return std::shared_ptr(nullptr); } if (!core_options_.GlobalIndexEnabled()) { @@ -124,31 +134,8 @@ Result> DataEvolutionBatchScan::EvalGlobalInd if (!index_scan_impl) { return Status::Invalid("invalid GlobalIndexScan, cannot cast to GlobalIndexScanImpl"); } - PAIMON_ASSIGN_OR_RAISE(std::vector indexed_row_ranges, index_scan->GetRowRangeList()); - if (indexed_row_ranges.empty()) { - return std::shared_ptr(nullptr); - } - const auto& snapshot = index_scan_impl->GetSnapshot(); - const std::optional& next_row_id = snapshot.NextRowId(); - if (!next_row_id) { - return Status::Invalid("invalid snapshot, next row id is null"); - } - std::vector non_indexed_row_ranges = - Range(0, next_row_id.value() - 1).Exclude(indexed_row_ranges); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr index_result, - index_scan_impl->ParallelScan(indexed_row_ranges, predicate, vector_search_, executor_)); - if (!index_result) { - return std::shared_ptr(nullptr); - } - if (!non_indexed_row_ranges.empty()) { - for (const auto& range : non_indexed_row_ranges) { - PAIMON_ASSIGN_OR_RAISE(index_result, - index_result->Or(BitmapGlobalIndexResult::FromRanges({range}))); - } - } - return index_result; + return index_scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt); } } // namespace paimon diff --git a/src/paimon/core/table/source/data_evolution_batch_scan.h b/src/paimon/core/table/source/data_evolution_batch_scan.h index fad108ea7..88f20f762 100644 --- a/src/paimon/core/table/source/data_evolution_batch_scan.h +++ b/src/paimon/core/table/source/data_evolution_batch_scan.h @@ -32,7 +32,6 @@ class DataEvolutionBatchScan : public AbstractTableScan { const std::shared_ptr& snapshot_reader, std::unique_ptr&& batch_scan, const std::shared_ptr& global_index_result, - const std::shared_ptr& vector_search, const CoreOptions& core_options, const std::shared_ptr& pool, const std::shared_ptr& executor); @@ -40,7 +39,7 @@ class DataEvolutionBatchScan : public 
AbstractTableScan { private: Result> WrapToIndexedSplits( - const std::shared_ptr& data_plan, const std::vector& row_ranges, + const std::shared_ptr& data_plan, const RowRangeIndex& row_range_index, const std::map& id_to_score) const; Result> EvalGlobalIndex() const; @@ -49,7 +48,6 @@ class DataEvolutionBatchScan : public AbstractTableScan { std::string table_path_; std::unique_ptr batch_scan_; std::shared_ptr global_index_result_; - std::shared_ptr vector_search_; std::shared_ptr executor_; }; diff --git a/src/paimon/core/table/source/data_table_batch_scan.h b/src/paimon/core/table/source/data_table_batch_scan.h index 405b784d6..d5a1d44e6 100644 --- a/src/paimon/core/table/source/data_table_batch_scan.h +++ b/src/paimon/core/table/source/data_table_batch_scan.h @@ -44,8 +44,8 @@ class DataTableBatchScan : public AbstractTableScan { return snapshot_reader_->GetPartitionPredicate(); } - DataTableBatchScan* WithRowRanges(const std::vector& row_ranges) { - snapshot_reader_->WithRowRanges(row_ranges); + DataTableBatchScan* WithRowRangeIndex(const RowRangeIndex& row_range_index) { + snapshot_reader_->WithRowRangeIndex(row_range_index); return this; } diff --git a/src/paimon/core/table/source/snapshot/snapshot_reader.h b/src/paimon/core/table/source/snapshot/snapshot_reader.h index b590cd077..b00a96404 100644 --- a/src/paimon/core/table/source/snapshot/snapshot_reader.h +++ b/src/paimon/core/table/source/snapshot/snapshot_reader.h @@ -80,8 +80,8 @@ class SnapshotReader { return this; } - SnapshotReader* WithRowRanges(const std::vector& row_ranges) { - scan_->WithRowRanges(row_ranges); + SnapshotReader* WithRowRangeIndex(const RowRangeIndex& row_range_index) { + scan_->WithRowRangeIndex(row_range_index); return this; } diff --git a/src/paimon/core/table/source/table_scan.cpp b/src/paimon/core/table/source/table_scan.cpp index 43ac11b4f..e28b01c8b 100644 --- a/src/paimon/core/table/source/table_scan.cpp +++ b/src/paimon/core/table/source/table_scan.cpp @@ -242,8 +242,7 @@ 
Result> TableScan::Create(std::unique_ptr( context->GetPath(), snapshot_reader, std::move(batch_scan), context->GetGlobalIndexResult(), - context->GetScanFilters()->GetVectorSearch(), core_options, context->GetMemoryPool(), - context->GetExecutor()); + core_options, context->GetMemoryPool(), context->GetExecutor()); } } // namespace paimon diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp index e0ea4f896..6cdcf8e71 100644 --- a/test/inte/global_index_test.cpp +++ b/test/inte/global_index_test.cpp @@ -20,13 +20,15 @@ #include "paimon/common/global_index/bitmap/bitmap_global_index_factory.h" #include "paimon/common/table/special_fields.h" #include "paimon/common/utils/scope_guard.h" +#include "paimon/core/global_index/global_index_scan_impl.h" #include "paimon/core/global_index/indexed_split_impl.h" -#include "paimon/core/global_index/row_range_global_index_scanner_impl.h" #include "paimon/core/table/source/data_split_impl.h" #include "paimon/defs.h" #include "paimon/fs/file_system.h" #include "paimon/global_index/bitmap_global_index_result.h" #include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/global_index_reader.h" +#include "paimon/global_index/global_index_result.h" #include "paimon/global_index/global_index_scan.h" #include "paimon/global_index/global_index_write_task.h" #include "paimon/predicate/literal.h" @@ -151,12 +153,10 @@ class GlobalIndexTest : public ::testing::Test, public ::testing::WithParamInter Result> ScanGlobalIndexAndData( const std::string& table_path, const std::shared_ptr& predicate, - const std::shared_ptr& vector_search = nullptr, const std::map& options = {}, const std::shared_ptr& index_result = nullptr) const { ScanContextBuilder scan_context_builder(table_path); scan_context_builder.SetPredicate(predicate) - .SetVectorSearch(vector_search) .SetOptions(options) .SetGlobalIndexResult(index_result) .WithFileSystem(fs_); @@ -184,7 +184,7 @@ class GlobalIndexTest : 
public ::testing::Test, public ::testing::WithParamInter index_result = std::make_shared(std::move(bitmap), std::move(scores)); } - return ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, /*vector_search=*/nullptr, + return ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, /*options=*/{}, index_result); } @@ -432,26 +432,23 @@ TEST_P(GlobalIndexTest, TestScanIndex) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; ASSERT_OK_AND_ASSIGN( - auto global_index_scan, + std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 7)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); // test index reader // test f0 field - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); // test f0, f1, f2 fields - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); { // test with non predicate - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(/*predicate=*/nullptr, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN( + auto index_result, + global_index_scan_impl->Scan(/*predicate=*/nullptr, /*row_range_index=*/std::nullopt)); 
ASSERT_FALSE(index_result); } { @@ -459,8 +456,8 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{0,7}"); } { @@ -468,40 +465,40 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{1,2,3,4,5,6}"); } { // test equal predicate for f1 auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{4,6,7}"); } { // test equal predicate for f2 auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(1)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{0,1,4,5}"); } { // test is null predicate auto predicate = PredicateBuilder::IsNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); - ASSERT_OK_AND_ASSIGN(auto 
index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{7}"); } { // test is not null predicate auto predicate = PredicateBuilder::IsNotNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{0,1,2,3,4,5,6}"); } { @@ -510,8 +507,8 @@ TEST_P(GlobalIndexTest, TestScanIndex) { /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, {Literal(FieldType::STRING, "Alice", 5), Literal(FieldType::STRING, "Bob", 3), Literal(FieldType::STRING, "Lucy", 4)}); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{0,1,4,5,7}"); } { @@ -520,8 +517,8 @@ TEST_P(GlobalIndexTest, TestScanIndex) { /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, {Literal(FieldType::STRING, "Alice", 5), Literal(FieldType::STRING, "Bob", 3), Literal(FieldType::STRING, "Lucy", 4)}); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{2,3,6}"); } { @@ -532,8 +529,8 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto 
index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{7}"); } { @@ -544,16 +541,16 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{0,4,6,7}"); } { // test non-result auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(30)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{}"); } { @@ -568,48 +565,49 @@ TEST_P(GlobalIndexTest, TestScanIndex) { ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f1_predicate, f2_predicate, f0_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{}"); } { // test greater than predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, 
global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_FALSE(index_result); } { // test greater or equal predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::GreaterOrEqual(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_FALSE(index_result); } { // test less than predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::LessThan(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_FALSE(index_result); } { // test less or equal predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::LessOrEqual(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_FALSE(index_result); } { // test a predicate for field with no index auto f3_predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(1.2)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(f3_predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN( + auto index_result, + global_index_scan_impl->Scan(f3_predicate, /*row_range_index=*/std::nullopt)); ASSERT_FALSE(index_result); } } @@ -623,26 +621,22 @@ TEST_P(GlobalIndexTest, 
TestScanIndexWithSpecificSnapshot) { "/append_with_global_index.db/append_with_global_index"; // snapshot 2 has f0 index ASSERT_OK_AND_ASSIGN( - auto global_index_scan, + std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/2l, /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 7)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); // test index reader // test f0 field - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); // test f1 field - ASSERT_OK_AND_ASSIGN(auto index_reader2, range_scanner->CreateReader("f1", "bitmap")); - ASSERT_FALSE(index_reader2); + ASSERT_OK_AND_ASSIGN(auto index_readers2, global_index_scan->CreateReaders("f1", std::nullopt)); + ASSERT_EQ(index_readers2.size(), 0u); + + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); { // test and predicate auto f0_predicate = @@ -651,8 +645,8 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, 
/*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_result->ToString(), "{0,7}"); } { @@ -663,8 +657,8 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( + predicate, /*row_range_index=*/std::nullopt)); ASSERT_FALSE(index_result); } } @@ -678,26 +672,20 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshotWithNoIndex) { "/append_with_global_index.db/append_with_global_index"; // snapshot 1 has no index ASSERT_OK_AND_ASSIGN( - auto global_index_scan, + std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/1l, /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_TRUE(ranges.empty()); - - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); // test index reader - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); - ASSERT_FALSE(index_reader); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 0u); + + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, 
Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + global_index_scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); ASSERT_FALSE(index_result); } @@ -709,55 +697,39 @@ TEST_P(GlobalIndexTest, TestScanIndexWithRange) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; ASSERT_OK_AND_ASSIGN( - auto global_index_scan, + std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 7)})); + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); { - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 3))); - auto scanner_impl = - std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - // test index reader - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); - { - // test non-exist index type - ASSERT_OK_AND_ASSIGN(auto non_exist_index_reader, - range_scanner->CreateReader("f0", "non-exist")); - ASSERT_FALSE(non_exist_index_reader); - } - - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto 
evaluator_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN( + auto evaluator_result, + global_index_scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); ASSERT_EQ(evaluator_result->ToString(), "{1,2,3,4,5,6}"); } { - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(10, 13))); - auto scanner_impl = - std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); + // invalid range + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(10, 13)})); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(index_readers.size(), 0u); - // test index reader - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); - ASSERT_FALSE(index_reader); - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + global_index_scan_impl->Scan(predicate, row_range_index)); ASSERT_FALSE(index_result); } } @@ -773,32 +745,29 @@ TEST_P(GlobalIndexTest, TestScanIndexWithPartition) { "/append_with_global_index_with_partition.db/append_with_global_index_with_partition"; auto check_result = [&](const std::optional>>& partitions) { - ASSERT_OK_AND_ASSIGN(auto global_index_scan, + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 4)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, - global_index_scan->CreateRangeScan(Range(0, 4))); - auto scanner_impl = - 
std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - // test index reader - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); - ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, + RowRangeIndex::Create({Range(0, 4)})); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(index_readers.size(), 1u); + ASSERT_OK_AND_ASSIGN(auto index_result, index_readers[0]->VisitEqual( + Literal(FieldType::STRING, "Bob", 3))); ASSERT_EQ(index_result->ToString(), "{1,4}"); - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); + auto global_index_scan_impl = + std::dynamic_pointer_cast(global_index_scan); + { // null result as f2 does not have index auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(1)); ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + global_index_scan_impl->Scan(predicate, row_range_index)); ASSERT_FALSE(index_result); } { @@ -807,7 +776,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithPartition) { FieldType::STRING, Literal(FieldType::STRING, "Bob", 3)); ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + global_index_scan_impl->Scan(predicate, row_range_index)); ASSERT_EQ(index_result->ToString(), "{0,2,3}"); } { @@ -816,7 +785,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithPartition) { FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + global_index_scan_impl->Scan(predicate, row_range_index)); ASSERT_EQ(index_result->ToString(), "{0}"); } }; @@ -839,22 +808,19 @@ TEST_P(GlobalIndexTest, TestScanUnregisteredIndex) { std::string 
table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; ASSERT_OK_AND_ASSIGN( - auto global_index_scan, + std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); - ASSERT_FALSE(index_reader); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 0u); - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Bob", 3)); ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + global_index_scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); ASSERT_FALSE(index_result); } @@ -886,14 +852,10 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndex) { auto global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 7)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", 
std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); } @@ -955,27 +917,18 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { write_data_and_index(src_array2, {{"f2", "20"}}, Range(4, 8)); auto scan_and_check_result = [&](const std::map& partition, - const Range& expected_range, VectorSearch::PreFilter filter, - int32_t limit, const std::string& bitmap_result, + const std::optional& row_range_index, + VectorSearch::PreFilter filter, int32_t limit, + const std::string& bitmap_result, const std::string& lumina_result, - const std::vector& read_row_ranges, const std::shared_ptr& expected_array, const std::map& id_to_score) { std::vector> partitions = {partition}; - ASSERT_OK_AND_ASSIGN(auto global_index_scan, + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, lumina_options, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({expected_range})); - - ASSERT_OK_AND_ASSIGN(auto range_scanner, - global_index_scan->CreateRangeScan(expected_range)); - auto scanner_impl = - std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - // check bitmap index - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); auto predicate1 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, @@ -985,14 +938,15 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { Literal(FieldType::STRING, "Paul", 4)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({predicate1, predicate2})); - ASSERT_OK_AND_ASSIGN(auto index_result, - 
evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate, row_range_index)); ASSERT_TRUE(index_result); ASSERT_EQ(index_result->ToString(), bitmap_result); // check lumina index - ASSERT_OK_AND_ASSIGN(auto lumina_reader, range_scanner->CreateReader("f1", "lumina")); - + ASSERT_OK_AND_ASSIGN(auto lumina_readers, + global_index_scan->CreateReaders("f1", row_range_index)); + ASSERT_EQ(lumina_readers.size(), 1u); + auto lumina_reader = lumina_readers[0]; std::vector query = {1.0f, 1.0f, 1.0f, 1.1f}; auto vector_search = std::make_shared( "f1", limit, query, filter, @@ -1000,21 +954,11 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { ASSERT_OK_AND_ASSIGN(auto scored_result, lumina_reader->VisitVectorSearch(vector_search)); ASSERT_EQ(scored_result->ToString(), lumina_result); - // check evaluate predicate and vector search - auto vector_search_without_filter = vector_search->ReplacePreFilter(nullptr); - ASSERT_OK_AND_ASSIGN(auto compound_index_result, - evaluator->Evaluate(predicate, vector_search_without_filter)); - ASSERT_TRUE(compound_index_result); - ASSERT_EQ(compound_index_result->ToString(), lumina_result); - // check read array std::vector read_field_names = schema->field_names(); read_field_names.push_back("_INDEX_SCORE"); - ASSERT_OK_AND_ASSIGN(auto result_with_offset, - compound_index_result->AddOffset(expected_range.from)); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, - /*vector_search=*/nullptr, - /*options=*/{}, result_with_offset)); + /*options=*/{}, scored_result)); ASSERT_OK(ReadData(table_path, read_field_names, expected_array, /*predicate=*/nullptr, plan)); }; @@ -1034,9 +978,9 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { [0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21] ])") .ValueOrDie(); - scan_and_check_result({{"f2", "10"}}, Range(0, 3), filter, /*limit=*/2, "{0}", - "row ids: 
{0}, scores: {4.21}", {Range(0, 0)}, expected_array, - id_to_score1); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(0, 3)})); + scan_and_check_result({{"f2", "10"}}, row_range_index, filter, /*limit=*/2, "{0}", + "row ids: {0}, scores: {4.21}", expected_array, id_to_score1); } { // test scan and read for f2=20 @@ -1046,18 +990,9 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { [0, "Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1, 322.21] ])") .ValueOrDie(); - scan_and_check_result({{"f2", "20"}}, Range(4, 8), filter, /*limit=*/1, "{3,4}", - "row ids: {4}, scores: {322.21}", {Range(4, 4)}, expected_array, - id_to_score2); - } - { - // test invalid range input - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, lumina_options, fs_, pool_)); - ASSERT_NOK_WITH_MSG(global_index_scan->CreateRangeScan(Range(0, 8)), - "input range contain multiple partitions, fail to create range scan"); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(4, 8)})); + scan_and_check_result({{"f2", "20"}}, row_range_index, filter, /*limit=*/1, "{7,8}", + "row ids: {8}, scores: {322.21}", expected_array, id_to_score2); } { // test invalid partition input @@ -1287,215 +1222,12 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScan) { PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN( - auto plan, ScanGlobalIndexAndData(table_path, predicate, /*vector_search=*/nullptr, - {{"global-index.enabled", "false"}})); + auto plan, + ScanGlobalIndexAndData(table_path, predicate, {{"global-index.enabled", "false"}})); ASSERT_OK(ReadData(table_path, write_cols, expected_all_array, predicate, plan)); } } -#ifdef PAIMON_ENABLE_LUMINA -TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithVectorSearch) { - arrow::FieldVector fields = { - 
arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), - arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; - std::map lumina_write_options = {{"lumina.index.dimension", "4"}, - {"lumina.index.type", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "rawf32"}}; - std::map lumina_read_options = { - {"lumina.search.parallel_number", "10"}}; - - auto schema = arrow::schema(fields); - std::map options = {{Options::MANIFEST_FORMAT, "orc"}, - {Options::FILE_FORMAT, file_format_}, - {Options::FILE_SYSTEM, "local"}, - {Options::ROW_TRACKING_ENABLED, "true"}, - {Options::DATA_EVOLUTION_ENABLED, "true"}}; - CreateTable(/*partition_keys=*/{}, schema, options); - - std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); - std::vector write_cols = schema->field_names(); - - auto src_array = std::dynamic_pointer_cast( - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ -["Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], -["Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], -["Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], -["Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1], -["Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], -["Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], -["Tony", [11.0, 10.0, 11.0, 10.0], 20, 17.1], -["Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], -["Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1] - ])") - .ValueOrDie()); - ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); - ASSERT_OK(Commit(table_path, commit_msgs)); - - auto result_fields = fields; - result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); - { - // read when no index is built - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - 
/*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], -[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], -[0, "Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], -[0, "Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1], -[0, "Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], -[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], -[0, "Tony", [11.0, 10.0, 11.0, 10.0], 20, 17.1], -[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], -[0, "Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); - } - - // write and commit bitmap global index - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", /*options=*/{}, - Range(0, 8))); - - auto read_cols = write_cols; - read_cols.push_back("_INDEX_SCORE"); - result_fields.insert(result_fields.end(), SpecialFields::IndexScore().ArrowField()); - { - // read when only bitmap index is built - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, null], -[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1, null] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, 
predicate, plan)); - } - - // write and commit lumina global index - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", - /*options=*/lumina_write_options, Range(0, 8))); - - // scan and read with global index - { - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); - } - { - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/3, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21], -[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1, 398.01] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); - } - { - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Bob", 3)); - auto vector_search = std::make_shared( - "f1", /*limit=*/3, 
std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1, 2.01], -[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1, 360.01] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); - } - { - // test only has vector search with pre_filter - auto vector_search = std::make_shared( - "f1", /*limit=*/3, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), - /*filter=*/[](int64_t row_id) { return row_id == 1 || row_id == 5; }, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, - vector_search, lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1, 2.01], -[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1, 360.01] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, /*predicate=*/nullptr, plan)); - } - { - // test only has vector search with no pre_filter - auto vector_search = std::make_shared( - "f1", /*limit=*/2, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), - /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, - vector_search, lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1, 2.01], -[0, "Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1, 0.01] - ])") - 
.ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, /*predicate=*/nullptr, plan)); - } - { - // test invalid vector search - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Bob", 3)); - auto vector_search = std::make_shared( - "f1", /*limit=*/3, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), - /*filter=*/[](int64_t row_id) { return true; }, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_NOK_WITH_MSG( - ScanGlobalIndexAndData(table_path, predicate, vector_search, lumina_read_options), - "Predicate result and pre_filter in VectorSearch conflict"); - } -} -#endif - TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithOnlyOnePartitionHasIndex) { CreateTable(/*partition_keys=*/{"f1"}); std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); @@ -1531,17 +1263,13 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithOnlyOnePartitionHasIndex) auto result_fields = fields_; result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); { - // only f1 = 10 partition has index, f1 = 20 partition will not be filtered auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate)); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", 10, 1, 11.1], -[0, "Lucy", 20, 1, 15.1], -[0, "Tony", 20, 0, 17.1], -[0, "Alice", 20, null, 18.1] +[0, "Alice", 10, 1, 11.1] ])") .ValueOrDie(); @@ -1588,34 +1316,24 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithTwoIndexInDiffTwoPartition auto result_fields = fields_; result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); { - // only f1 = 10 partition has f0 index, f1 = 20 partition will not be filtered 
auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate)); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", 10, 1, 11.1], -[0, "Lucy", 20, 1, 15.1], -[0, "Tony", 20, 0, 17.1], -[0, "Alice", 20, null, 18.1] +[0, "Alice", 10, 1, 11.1] ])") .ValueOrDie(); ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); } { - // only f1 = 20 partition has f2 index, f1 = 10 partition will not be filtered auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(1)); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate)); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", 10, 1, 11.1], -[0, "Bob", 10, 1, 12.1], -[0, "Emily", 10, 0, 13.1], -[0, "Tony", 10, 0, 14.1], -[0, "Bob", 10, 1, 16.1], [0, "Lucy", 20, 1, 15.1] ])") .ValueOrDie(); @@ -1623,6 +1341,8 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithTwoIndexInDiffTwoPartition ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); } { + // only f1 = 10 partition has f0 index, query predicate1 results in ["Alice", 10, 1, 11.1] + // only f1 = 20 partition has f2 index, query predicate2 results in ["Lucy", 20, 1, 15.1] auto predicate1 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); @@ -1630,14 +1350,7 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithTwoIndexInDiffTwoPartition FieldType::INT, Literal(1)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({predicate1, predicate2})); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate)); - auto expected_array = - 
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", 10, 1, 11.1], -[0, "Lucy", 20, 1, 15.1] - ])") - .ValueOrDie(); - - ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); + ASSERT_OK(ReadData(table_path, write_cols, /*expected_array=*/nullptr, predicate, plan)); } { // predicate2 is partition filter @@ -1764,73 +1477,11 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithTwoPartitionAllWithIndex) } } -#ifdef PAIMON_ENABLE_LUMINA -TEST_P(GlobalIndexTest, TestInvalidGetRowRangeListWithIndexRangeMismatchViaDifferentType) { - arrow::FieldVector fields = { - arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), - arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; - std::map lumina_options = {{"lumina.index.dimension", "4"}, - {"lumina.index.type", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "rawf32"}, - {"lumina.search.parallel_number", "10"}}; - auto schema = arrow::schema(fields); - std::map options = {{Options::MANIFEST_FORMAT, "orc"}, - {Options::FILE_FORMAT, file_format_}, - {Options::FILE_SYSTEM, "local"}, - {Options::ROW_TRACKING_ENABLED, "true"}, - {Options::DATA_EVOLUTION_ENABLED, "true"}}; - CreateTable(/*partition_keys=*/{"f2"}, schema, options); - +TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithPartitionWithTwoFields) { + CreateTable(/*partition_keys=*/{"f1", "f2"}); std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); - std::vector write_cols = schema->field_names(); - // write partition f2 = 10 - auto src_array1 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ -["Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], -["Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], -["Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], -["Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1] - ])") - .ValueOrDie(); - ASSERT_OK_AND_ASSIGN(auto commit_msgs1, - WriteArray(table_path, {{"f2", "10"}}, write_cols, 
src_array1)); - ASSERT_OK(Commit(table_path, commit_msgs1)); - - // write partition f2 = 20 - auto src_array2 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ -["Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], -["Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], -["Tony", [11.0, 10.0, 11.0, 10.0], 20, 17.1], -["Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], -["Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1] - ])") - .ValueOrDie(); - ASSERT_OK_AND_ASSIGN(auto commit_msgs2, - WriteArray(table_path, {{"f2", "20"}}, write_cols, src_array2)); - ASSERT_OK(Commit(table_path, commit_msgs2)); - - // write and commit bitmap global index for f2 = 10 - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f2", "10"}}}, "f0", "bitmap", - /*options=*/{}, Range(0, 3))); - - // write and commit lumina global index for f2 = 20 - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f2", "20"}}}, "f1", "lumina", - /*options=*/lumina_options, Range(4, 8))); - - ASSERT_OK_AND_ASSIGN(auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, - /*options=*/lumina_options, fs_, pool_)); - ASSERT_NOK_WITH_MSG(global_index_scan->GetRowRangeList(), - "Inconsistent row ranges among index types"); -} -#endif - -TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithPartitionWithTwoFields) { - CreateTable(/*partition_keys=*/{"f1", "f2"}); - std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); - auto schema = arrow::schema(fields_); - // write and commit data + auto schema = arrow::schema(fields_); + // write and commit data std::vector write_cols = schema->field_names(); auto src_array1 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ @@ -1977,18 +1628,15 @@ TEST_P(GlobalIndexTest, TestScanIndexWithTwoIndexes) { GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/lumina_options, fs_, pool_)); - 
ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 8)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 8))); // query f0 - ASSERT_OK_AND_ASSIGN(auto index_readers, range_scanner->CreateReaders("f0")); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); ASSERT_EQ(index_readers.size(), 1); ASSERT_OK_AND_ASSIGN(auto index_result, index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); // query f1 - ASSERT_OK_AND_ASSIGN(index_readers, range_scanner->CreateReaders("f1")); + ASSERT_OK_AND_ASSIGN(index_readers, global_index_scan->CreateReaders("f1", std::nullopt)); ASSERT_EQ(index_readers.size(), 1); std::vector query = {11.0f, 11.0f, 11.0f, 11.0f}; ASSERT_OK_AND_ASSIGN( @@ -1999,19 +1647,15 @@ TEST_P(GlobalIndexTest, TestScanIndexWithTwoIndexes) { ASSERT_EQ(scored_result->ToString(), "row ids: {7}, scores: {0.00}"); // query f2 - ASSERT_OK_AND_ASSIGN(index_readers, range_scanner->CreateReaders("f2")); + ASSERT_OK_AND_ASSIGN(index_readers, global_index_scan->CreateReaders("f2", std::nullopt)); ASSERT_EQ(index_readers.size(), 0); } +#endif TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithExternalPath) { arrow::FieldVector fields = { arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; - std::map lumina_options = {{"lumina.index.dimension", "4"}, - {"lumina.index.type", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "rawf32"}, - {"lumina.search.parallel_number", "10"}}; auto schema = arrow::schema(fields); std::map options = {{Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, file_format_}, @@ -2041,40 +1685,28 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithExternalPath) { // write and commit bitmap 
global index auto external_dir1 = UniqueTestDirectory::Create("local"); - ASSERT_OK( - WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", - /*options=*/{{"global-index.external-path", "FILE://" + external_dir1->Str()}}, - Range(0, 8))); - - auto external_dir2 = UniqueTestDirectory::Create("local"); - auto lumina_options_with_external_path = lumina_options; - lumina_options_with_external_path["global-index.external-path"] = - "FILE://" + external_dir2->Str(); - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", - /*options=*/lumina_options_with_external_path, Range(0, 8))); + std::map index_options = { + {"global-index.external-path", "FILE://" + external_dir1->Str()}}; + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", index_options, + Range(0, 8))); - auto read_cols = write_cols; - read_cols.push_back("_INDEX_SCORE"); auto result_fields = fields; result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); - result_fields.insert(result_fields.end(), SpecialFields::IndexScore().ArrowField()); // test scan and read auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_options); - ASSERT_OK_AND_ASSIGN( - auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, lumina_options)); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, index_options)); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21] +[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], +[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1] ])") .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, 
expected_array, predicate, plan)); + + ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); } TEST_P(GlobalIndexTest, TestIOException) { @@ -2100,11 +1732,6 @@ TEST_P(GlobalIndexTest, TestIOException) { {Options::FILE_SYSTEM, "local"}, {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}}; - std::map lumina_options = {{"lumina.index.dimension", "4"}, - {"lumina.index.type", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "rawf32"}, - {"lumina.search.parallel_number", "10"}}; std::string table_path; bool write_run_complete = false; auto io_hook = IOHook::GetInstance(); @@ -2123,17 +1750,12 @@ TEST_P(GlobalIndexTest, TestIOException) { WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", /*options=*/{}, Range(0, 3)); CHECK_HOOK_STATUS(bitmap_index_write_status, i); - // write lumina index - auto lumina_index_write_status = - WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", - /*options=*/lumina_options, Range(0, 3)); - CHECK_HOOK_STATUS_WITHOUT_MESSAGE_CHECK(lumina_index_write_status); write_run_complete = true; break; } ASSERT_TRUE(write_run_complete); - // read for bitmap and lumina + // read for bitmap bool read_run_complete = false; for (size_t i = 0; i < 2000; i += paimon::test::RandomNumber(20, 30)) { ScopeGuard guard([&io_hook]() { io_hook->Clear(); }); @@ -2141,34 +1763,27 @@ TEST_P(GlobalIndexTest, TestIOException) { auto result_fields = fields; result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); - result_fields.insert(result_fields.end(), SpecialFields::IndexScore().ArrowField()); - auto read_cols = write_cols; - read_cols.push_back("_INDEX_SCORE"); auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - 
/*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_options); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [1.0, 0.0, 1.0, 0.0], 10, 13.1, 2.21] +[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], +[0, "Alice", [1.0, 0.0, 1.0, 0.0], 10, 13.1] ])") .ValueOrDie(); - auto plan_result = - ScanGlobalIndexAndData(table_path, predicate, vector_search, lumina_options); + auto plan_result = ScanGlobalIndexAndData(table_path, predicate); CHECK_HOOK_STATUS_WITHOUT_MESSAGE_CHECK(plan_result.status()); auto plan = std::move(plan_result).value(); - auto read_status = ReadData(table_path, read_cols, expected_array, predicate, plan); + auto read_status = ReadData(table_path, write_cols, expected_array, predicate, plan); CHECK_HOOK_STATUS(read_status, i); read_run_complete = true; break; } ASSERT_TRUE(read_run_complete); } -#endif TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithRangeBitmap) { CreateTable(); @@ -2464,17 +2079,14 @@ TEST_P(GlobalIndexTest, TestLuceneWriteCommitScanReadIndexWithScore) { /*options=*/lucene_options, Range(0, 3))); ASSERT_OK_AND_ASSIGN( - auto global_index_scan, + std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 3)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 3))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - // test f0 field - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "lucene-fts")); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", /*row_range_index=*/std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + auto index_reader = index_readers[0]; { ASSERT_OK_AND_ASSIGN(auto 
index_result, index_reader->VisitFullTextSearch(std::make_shared( @@ -2503,6 +2115,777 @@ TEST_P(GlobalIndexTest, TestLuceneWriteCommitScanReadIndexWithScore) { } #endif +TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndex) { + // BTreeGlobalIndexWriter requires keys to be written in monotonically increasing order. + // Therefore the source data must be pre-sorted by the indexed column (f0, string). + CreateTable(); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + auto schema = arrow::schema(fields_); + + std::vector write_cols = schema->field_names(); + + // Data sorted by f0 (string, ascending): Alice < Bob < Bob < Emily < Lucy < Tony < Tony + // The last row has f0=null which is treated separately by the null bitmap. + auto src_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 10, 1, 11.1], +["Bob", 10, 1, 12.1], +["Bob", 20, 0, 16.1], +["Emily", 10, 0, 13.1], +["Lucy", 20, 1, 15.1], +["Tony", 10, 0, 14.1], +["Tony", 20, 0, 17.1], +[null, 20, null, 18.1] + ])") + .ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Write btree-global index on f0 + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "btree", + /*options=*/{}, Range(0, 7))); + + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", /*row_range_index=*/std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + auto index_reader = index_readers[0]; + + { + // VisitEqual: "Alice" -> row 0 + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0}"); + } + { + // VisitEqual: "Bob" -> rows 1,2 + 
ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2}"); + } + { + // VisitEqual: non-existent key -> empty + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitEqual(Literal(FieldType::STRING, "Zara", 4))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{}"); + } + { + // VisitNotEqual: "Bob" -> all non-null except Bob rows -> {0,3,4,5,6} + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitNotEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,3,4,5,6}"); + } + { + // VisitIsNull -> row 7 (null key) + ASSERT_OK_AND_ASSIGN(auto result, index_reader->VisitIsNull()); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{7}"); + } + { + // VisitIsNotNull -> rows 0-6 + ASSERT_OK_AND_ASSIGN(auto result, index_reader->VisitIsNotNull()); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,1,2,3,4,5,6}"); + } + { + // VisitIn: {"Alice", "Lucy"} -> rows {0, 4} + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitIn({Literal(FieldType::STRING, "Alice", 5), + Literal(FieldType::STRING, "Lucy", 4)})); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,4}"); + } + { + // VisitNotIn: {"Alice", "Lucy"} -> all non-null except {0,4} -> {1,2,3,5,6} + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitNotIn({Literal(FieldType::STRING, "Alice", 5), + Literal(FieldType::STRING, "Lucy", 4)})); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2,3,5,6}"); + } + { + // VisitLessThan: "Emily" -> keys < "Emily" -> Alice(0), Bob(1,2) -> {0,1,2} + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitLessThan(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,1,2}"); + } + { + // VisitLessOrEqual: "Emily" -> keys <= "Emily" -> Alice(0), Bob(1,2), Emily(3) -> + // {0,1,2,3} + ASSERT_OK_AND_ASSIGN( + auto result, 
index_reader->VisitLessOrEqual(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,1,2,3}"); + } + { + // VisitGreaterThan: "Emily" -> keys > "Emily" -> Lucy(4), Tony(5,6) -> {4,5,6} + ASSERT_OK_AND_ASSIGN( + auto result, index_reader->VisitGreaterThan(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{4,5,6}"); + } + { + // VisitGreaterOrEqual: "Emily" -> keys >= "Emily" -> Emily(3), Lucy(4), Tony(5,6) -> + // {3,4,5,6} + ASSERT_OK_AND_ASSIGN( + auto result, index_reader->VisitGreaterOrEqual(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{3,4,5,6}"); + } + + auto scan_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scan_impl); + { + // Equal predicate via evaluator + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Tony", 4)); + ASSERT_OK_AND_ASSIGN(auto result, + scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{5,6}"); + } + { + // AND predicate: f0 == "Bob" AND f1 == 20 + // f0 == "Bob" -> {1,2}, but f1 index does not exist -> AND yields {1,2} + // (fields without index return nullptr, AND with nullptr keeps the other side) + auto f0_predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", + FieldType::INT, Literal(20)); + ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); + ASSERT_OK_AND_ASSIGN(auto result, + scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2}"); + } + { + // row_range_index filtering: range [0,2] should only load that range + 
ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(0, 2)})); + ASSERT_OK_AND_ASSIGN(auto range_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(range_readers.size(), 1u); + ASSERT_OK_AND_ASSIGN(auto result, + range_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0}"); + } + { + // Invalid row_range_index: no intersection -> empty readers + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, + RowRangeIndex::Create({Range(100, 200)})); + ASSERT_OK_AND_ASSIGN(auto range_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(range_readers.size(), 0u); + } + + // Test full pipeline: scan with predicate -> read data + { + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + auto scan_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_OK_AND_ASSIGN(auto index_result, + scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{1,2}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Bob", 10, 1, 12.1], +[0, "Bob", 20, 0, 16.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } +} + +TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndexWithPartition) { + // BTree index with partitioned table. Each partition's data is sorted by f0 independently. 
+ auto schema = arrow::schema(fields_); + std::map options = {{Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, file_format_}, + {Options::FILE_SYSTEM, "local"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(/*partition_keys=*/{"f1"}, schema, options); + + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + std::vector write_cols = schema->field_names(); + + // Write partition f1=10. Data sorted by f0: Alice < Bob < Bob < Emily < Tony + auto src_array1 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 10, 1, 11.1], +["Bob", 10, 1, 12.1], +["Bob", 10, 0, 13.1], +["Emily", 10, 0, 14.1], +["Tony", 10, 1, 15.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto commit_msgs1, + WriteArray(table_path, {{"f1", "10"}}, write_cols, src_array1)); + ASSERT_OK(Commit(table_path, commit_msgs1)); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f1", "10"}}}, "f0", "btree", + /*options=*/{}, Range(0, 4))); + + // Write partition f1=20. 
Data sorted by f0: Alice < Lucy < Tony + auto src_array2 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 20, null, 16.1], +["Lucy", 20, 1, 17.1], +["Tony", 20, 0, 18.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto commit_msgs2, + WriteArray(table_path, {{"f1", "20"}}, write_cols, src_array2)); + ASSERT_OK(Commit(table_path, commit_msgs2)); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f1", "20"}}}, "f0", "btree", + /*options=*/{}, Range(5, 7))); + + // Scan all partitions + { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + // One reader per partition range -> 2 ranges -> UnionGlobalIndexReader wraps them + ASSERT_EQ(index_readers.size(), 1u); + + // "Alice" exists in both partitions: local ids {0} in range [0,4] -> global 0, + // and local ids {0} in range [5,7] -> global 5 + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,5}"); + + // "Bob" only in partition f1=10: local ids {1,2} -> global {1,2} + ASSERT_OK_AND_ASSIGN(auto result2, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result2); + ASSERT_EQ(result2->ToString(), "{1,2}"); + + // "Lucy" only in partition f1=20: local ids {1} -> global {6} + ASSERT_OK_AND_ASSIGN(auto result3, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Lucy", 4))); + ASSERT_TRUE(result3); + ASSERT_EQ(result3->ToString(), "{6}"); + } + + // Scan with partition filter: only f1=10 + { + std::vector> partitions = {{{"f1", "10"}}}; + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, + 
/*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + + // "Alice" in f1=10 only -> global {0} + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0}"); + + // "Lucy" not in f1=10 -> empty + ASSERT_OK_AND_ASSIGN(auto result2, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Lucy", 4))); + ASSERT_TRUE(result2); + ASSERT_EQ(result2->ToString(), "{}"); + } + + // Scan with row_range_index filtering: only range [5,7] (partition f1=20) + { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(5, 7)})); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(index_readers.size(), 1u); + + // "Tony" in range [5,7]: local id {2} in range [5,7] -> global {7} + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Tony", 4))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{7}"); + } + + // Full pipeline with evaluator: Scan(predicate) -> read data + { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scanner_impl); + + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Tony", 4)); + ASSERT_OK_AND_ASSIGN(auto index_result, + scanner_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(index_result); + 
ASSERT_EQ(index_result->ToString(), "{4,7}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Tony", 10, 1, 15.1], +[0, "Tony", 20, 0, 18.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } +} + +TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { + // Test btree-global and bitmap index coexisting on the same field (f0). + // The evaluator should AND their results, producing the intersection. + CreateTable(); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + auto schema = arrow::schema(fields_); + std::vector write_cols = schema->field_names(); + + // Data sorted by f0 for btree: Alice < Bob < Bob < Emily < Lucy < Tony < Tony < null + auto src_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 10, 1, 11.1], +["Bob", 10, 1, 12.1], +["Bob", 25, 1, 16.1], +["Emily", 15, 0, 13.1], +["Lucy", 20, 1, 15.1], +["Tony", 20, 0, 14.1], +["Tony", 30, 0, 17.1], +[null, 30, null, 18.1] + ])") + .ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Build both indexes on f0 + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "btree", + /*options=*/{}, Range(0, 7))); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", + /*options=*/{}, Range(0, 7))); + + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + + // Two index types on f0 -> 2 readers + 
ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 2u); + + // Each reader individually should return the same result for Equal("Bob") + for (size_t i = 0; i < index_readers.size(); i++) { + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[i]->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2}"); + } + + // Via evaluator: the two indexes' results get AND, still {1,2} + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scanner_impl); + ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->GetOrCreateIndexEvaluator()); + { + // Equal predicate + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + ASSERT_OK_AND_ASSIGN(auto result, + evaluator->Evaluate(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2}"); + } + { + // NotEqual predicate: both indexes agree on non-null, non-"Bob" rows -> {0,3,4,5,6} + auto predicate = + PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + ASSERT_OK_AND_ASSIGN(auto result, + evaluator->Evaluate(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,3,4,5,6}"); + } + { + // IsNull: both agree on row 7 + auto predicate = + PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING); + ASSERT_OK_AND_ASSIGN(auto result, + evaluator->Evaluate(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{7}"); + } + + // Full pipeline: f0 == "Alice" -> read data + { + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Alice", 5)); + ASSERT_OK_AND_ASSIGN(auto 
index_result, + scanner_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{0}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Alice", 10, 1, 11.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } + // Full pipeline with a compound AND predicate: f0 is covered by both the btree and bitmap + // indexes, while f1 has no global index. The f1 conjunct therefore yields no index result + // (nullptr) and the index scan keeps only the f0 rows; it does not filter on f1 == 10. + { + // f0 == "Bob" AND f1 == 10 (f1 has no index -> nullptr -> keeps btree+bitmap result) + auto f0_pred = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + auto f1_pred = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", + FieldType::INT, Literal(10)); + ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_pred, f1_pred})); + ASSERT_OK_AND_ASSIGN(auto index_result, + scanner_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{1,2}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Bob", 10, 1, 12.1], +[0, "Bob", 25, 1, 16.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, 
/*predicate=*/nullptr, plan)); + } +} + +TEST_P(GlobalIndexTest, TestBTreeScanWithPartitionWithMultiMeta) { + if (file_format_ == "lance" || file_format_ == "avro") { + return; + } + std::string table_path = + paimon::test::GetDataDir() + "/" + file_format_ + + "/append_with_btree_with_partition.db/append_with_btree_with_partition"; + + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + + auto count_rows = [](const std::shared_ptr& result) -> int64_t { + EXPECT_TRUE(result); + EXPECT_OK_AND_ASSIGN(std::vector ranges, result->ToRanges()); + int64_t total = 0; + for (const auto& range : ranges) { + total += range.Count(); + } + return total; + }; + + auto get_reader = [&](const std::string& column) -> std::shared_ptr { + EXPECT_OK_AND_ASSIGN(auto readers, global_index_scan->CreateReaders(column, std::nullopt)); + EXPECT_EQ(readers.size(), 1u); + return readers[0]; + }; + + // ---- col_boolean ---- + { + auto reader = get_reader("col_boolean"); + ASSERT_TRUE(reader); + ASSERT_OK_AND_ASSIGN(auto eq_true, reader->VisitEqual(Literal(true))); + ASSERT_EQ(count_rows(eq_true), 20); + ASSERT_OK_AND_ASSIGN(auto eq_false, reader->VisitEqual(Literal(false))); + ASSERT_EQ(count_rows(eq_false), 20); + } + + // ---- col_int ---- + { + auto reader = get_reader("col_int"); + ASSERT_TRUE(reader); + ASSERT_OK_AND_ASSIGN(auto eq_15, reader->VisitEqual(Literal(15))); + ASSERT_EQ(count_rows(eq_15), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, reader->VisitEqual(Literal(100))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(30): i*3 > 30 -> i in [11, 19], 9 indices per partition -> 18 rows. + ASSERT_OK_AND_ASSIGN(auto gt_30, reader->VisitGreaterThan(Literal(30))); + ASSERT_EQ(count_rows(gt_30), 18); + // GreaterThan(57): nothing greater than the max value. 
+ ASSERT_OK_AND_ASSIGN(auto gt_max, reader->VisitGreaterThan(Literal(57))); + ASSERT_EQ(count_rows(gt_max), 0); + } + + // ---- col_date (values are 18000 + i for i in [0,19]) ---- + { + auto reader = get_reader("col_date"); + ASSERT_TRUE(reader); + // 18005 is present at i=5 in both partitions. + ASSERT_OK_AND_ASSIGN(auto eq_present, reader->VisitEqual(Literal(FieldType::DATE, 18005))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, reader->VisitEqual(Literal(FieldType::DATE, 17999))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(18010): i in [11, 19] -> 9 per partition -> 18 rows. + ASSERT_OK_AND_ASSIGN(auto gt_mid, + reader->VisitGreaterThan(Literal(FieldType::DATE, 18010))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_double (values are i * 2.2 for i in [0,19]) ---- + { + auto reader = get_reader("col_double"); + ASSERT_TRUE(reader); + // i=5 -> 11.0 + ASSERT_OK_AND_ASSIGN(auto eq_present, reader->VisitEqual(Literal(11.0))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, reader->VisitEqual(Literal(123.456))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(10 * 2.2 = 22.0): i in [11, 19] -> 9 per partition -> 18 rows. + ASSERT_OK_AND_ASSIGN(auto gt_mid, reader->VisitGreaterThan(Literal(10 * 2.2))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_timestamp (Timestamp from epoch millis = 1700000000000 + i*1000) ---- + { + auto reader = get_reader("col_timestamp"); + ASSERT_TRUE(reader); + // i=5 -> 1700000005000 ms. + ASSERT_OK_AND_ASSIGN( + auto eq_present, + reader->VisitEqual(Literal(Timestamp::FromEpochMillis(1700000000000L + 5 * 1000L)))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, + reader->VisitEqual(Literal(Timestamp::FromEpochMillis(1L)))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(i=10 boundary): i in [11, 19] -> 18 rows globally. 
+ ASSERT_OK_AND_ASSIGN(auto gt_mid, + reader->VisitGreaterThan( + Literal(Timestamp::FromEpochMillis(1700000000000L + 10 * 1000L)))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_timestamp_ltz (same physical values as col_timestamp) ---- + { + auto reader = get_reader("col_timestamp_ltz"); + ASSERT_TRUE(reader); + ASSERT_OK_AND_ASSIGN( + auto eq_present, + reader->VisitEqual(Literal(Timestamp::FromEpochMillis(1700000000000L + 7 * 1000L)))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto gt_mid, + reader->VisitGreaterThan( + Literal(Timestamp::FromEpochMillis(1700000000000L + 10 * 1000L)))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_decimal (unscaled = i * 123456, precision=18, scale=6) ---- + { + auto reader = get_reader("col_decimal"); + ASSERT_TRUE(reader); + // i=5 -> unscaled 617280 + ASSERT_OK_AND_ASSIGN( + auto eq_present, + reader->VisitEqual(Literal(Decimal::FromUnscaledLong(5 * 123456L, 18, 6)))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN( + auto eq_missing, + reader->VisitEqual(Literal(Decimal::FromUnscaledLong(/*unscaled=*/1L, 18, 6)))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(i=10): i in [11, 19] -> 18 rows globally. + ASSERT_OK_AND_ASSIGN( + auto gt_mid, + reader->VisitGreaterThan(Literal(Decimal::FromUnscaledLong(10 * 123456L, 18, 6)))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_string (values are "str_00000" .. 
"str_00019") ---- + { + auto reader = get_reader("col_string"); + ASSERT_TRUE(reader); + std::string present_value = "str_00005"; + ASSERT_OK_AND_ASSIGN(auto eq_present, + reader->VisitEqual(Literal(FieldType::STRING, present_value.data(), + present_value.size()))); + ASSERT_EQ(count_rows(eq_present), 2); + std::string missing_value = "str_99999"; + ASSERT_OK_AND_ASSIGN(auto eq_missing, + reader->VisitEqual(Literal(FieldType::STRING, missing_value.data(), + missing_value.size()))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan("str_00010"): lexicographically greater values are i in [11, 19]. + std::string mid_value = "str_00010"; + ASSERT_OK_AND_ASSIGN(auto gt_mid, + reader->VisitGreaterThan( + Literal(FieldType::STRING, mid_value.data(), mid_value.size()))); + ASSERT_EQ(count_rows(gt_mid), 18); + } +} + +#ifdef PAIMON_ENABLE_LUMINA +TEST_P(GlobalIndexTest, TestBTreeWithLumina) { + // Test btree on f0 (string) and lumina on f1 (vector) coexisting on different fields. + arrow::FieldVector fields = { + arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), + arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; + std::map lumina_options = {{"lumina.index.dimension", "4"}, + {"lumina.index.type", "bruteforce"}, + {"lumina.distance.metric", "l2"}, + {"lumina.encoding.type", "rawf32"}, + {"lumina.search.parallel_number", "10"}}; + auto schema = arrow::schema(fields); + std::map options = {{Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, file_format_}, + {Options::FILE_SYSTEM, "local"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(/*partition_keys=*/{}, schema, options); + + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + std::vector write_cols = schema->field_names(); + + // Data sorted by f0 for btree: Alice < Alice < Bob < Bob < Emily < Lucy < Paul < Tony + auto src_array = 
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ +["Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], +["Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], +["Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], +["Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], +["Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], +["Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], +["Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1], +["Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1] + ])") + .ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Build btree index on f0 + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "btree", + /*options=*/{}, Range(0, 7))); + // Build lumina index on f1 + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", + /*options=*/lumina_options, Range(0, 7))); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, + /*options=*/lumina_options, fs_, pool_)); + + // Query f0 via btree + { + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,1}"); + + ASSERT_OK_AND_ASSIGN( + auto result2, index_readers[0]->VisitLessThan(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result2); + ASSERT_EQ(result2->ToString(), "{0,1,2,3}"); + } + + // Query f1 via lumina (vector search) + { + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f1", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + std::vector query = {11.0f, 11.0f, 11.0f, 11.0f}; + auto vector_search = std::make_shared( + "f1", /*limit=*/1, query, /*filter=*/nullptr, + /*predicate=*/nullptr, 
/*distance_type=*/std::nullopt, /*options=*/lumina_options); + ASSERT_OK_AND_ASSIGN(auto scored_result, + index_readers[0]->VisitVectorSearch(vector_search)); + ASSERT_TRUE(scored_result); + ASSERT_EQ(scored_result->ToString(), "row ids: {1}, scores: {0.00}"); + } + + // Evaluator: btree on f0 = "Bob" + { + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scanner_impl); + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + ASSERT_OK_AND_ASSIGN(auto index_result, + scanner_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{2,3}"); + + // Read data for Bob + auto result_fields = fields; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], +[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } + + // Combined: btree f0 filter + lumina vector search with pre-filter + // Use btree result as pre_filter for lumina search + { + ASSERT_OK_AND_ASSIGN(auto btree_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(btree_readers.size(), 1u); + // Get rows where f0 == "Alice" -> {0, 1} + ASSERT_OK_AND_ASSIGN(auto btree_result, + btree_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(btree_result); + ASSERT_EQ(btree_result->ToString(), "{0,1}"); + + // Now vector search on f1 with pre_filter limiting to Alice's rows {0, 1} + ASSERT_OK_AND_ASSIGN(auto lumina_readers, + global_index_scan->CreateReaders("f1", 
std::nullopt)); + ASSERT_EQ(lumina_readers.size(), 1u); + std::vector query = {11.0f, 11.0f, 11.0f, 11.0f}; + auto filter = [](int64_t id) -> bool { return id == 0 || id == 1; }; + auto vector_search = std::make_shared( + "f1", /*limit=*/1, query, filter, + /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_options); + ASSERT_OK_AND_ASSIGN(auto scored_result, + lumina_readers[0]->VisitVectorSearch(vector_search)); + ASSERT_EQ(scored_result->ToString(), "row ids: {1}, scores: {0.00}"); + } +} +#endif + std::vector GetTestValuesForGlobalIndexTest() { std::vector values; values.emplace_back("parquet", false); diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/README b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/README new file mode 100644 index 000000000..7ab7a6619 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/README @@ -0,0 +1,27 @@ +pt:int +col_boolean:bool +col_int:int +col_date:int +col_double:double +col_timestamp:timestamp +col_timestamp_ltz:timestamp with local timezone +col_decimal:decimal +col_string:string + +pt:partition key +no bucket key +bucket count: -1 +global btree index: col_boolean/col_int/col_date/col_double/col_timestamp/col_timestamp_ltz/col_decimal/col_string + +Msgs: +snapshot-1 +Add:(0, i%2, i * 3, 18000 + i, i * 2.2, 1700000000000L + i * 1000L, 1700000000000L + i * 1000L, Decimal(i * 123456L, 18, 6), "str_0000i") +i = [0, 19] +NoCompact + +snapshot-2 +Add:(1, i%2, i * 3, 18000 + i, i * 2.2, 1700000000000L + i * 1000L, 1700000000000L + i * 1000L, Decimal(i * 123456L, 18, 6), "str_0000i") +i = [0, 19] +NoCompact + +snapshot-3: with global btree index for col_boolean/col_int/col_date/col_double/col_timestamp/col_timestamp_ltz/col_decimal/col_string \ No newline at end of file diff --git 
a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0323e05c-8903-49a2-9bc0-0a7e5a395211.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0323e05c-8903-49a2-9bc0-0a7e5a395211.index new file mode 100644 index 0000000000000000000000000000000000000000..e1f90eab95c9e43607683cb1fff063f11726e6a8 GIT binary patch literal 229 zcmd;J$as3bn~{NmiIIupMK+Ah%yA(f#unn(Q4C`R literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-061cad82-2fc6-448e-a3ce-7ac49c836273.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-061cad82-2fc6-448e-a3ce-7ac49c836273.index new file mode 100644 index 0000000000000000000000000000000000000000..9f8be0e9a9643e9782864403ac77657a1723ade1 GIT binary patch literal 176 zcmZPxaARO#ViaTv1k=1ML13DbB^XTeu!MkVewI)$&B77}rny+c!8A8Z1eoSvi3HO^ vEKy)um?auavvM$ifa7tOT8IckDnw-aQpHDLDKsJr%I8HBX9V&+L!1Htf-Mgh literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1c6c0c2a-b8f0-410e-90bf-26aadf8472af.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1c6c0c2a-b8f0-410e-90bf-26aadf8472af.index new file mode 100644 index 0000000000000000000000000000000000000000..a51d3c3dc0dbac979cf0e697b8f912d201aa9cec GIT binary patch literal 229 zcmd;JfC45)K@NvUj9?xUBQHn85oRcxljFcLRw$c?gQ1fh%I4>A$mWEySvVSexuI+> zjsyCacU~EQ?8H-?SHjaj6Fg7Pg#wr+_g(F}cjLptr ru?fcJ=v(6TKyZ literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2525fcef-a890-403a-a587-c9d63329adc6.index 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2525fcef-a890-403a-a587-c9d63329adc6.index new file mode 100644 index 0000000000000000000000000000000000000000..f51554f65f9725066444fc30bc8123d0ad51a2f5 GIT binary patch literal 229 zcmd;JfC45)CXQJ%L4eVLiIEw~76h_|IMUNVKnloafQl;u*<2hDt!h9vD^!gZkS)M* z?hFX%0onXe^@cz;AIDj+xG|6|4ApA}WOH+v7=wTXkj=pZHd9DtDJNW%nPDm;FIZTz aH}E&qxhT{uC{GAQl!1W}$oC9!3IG5Rv?^Wz literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2d75bcb5-1f7d-40b6-9e13-ed11ac16cbb2.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2d75bcb5-1f7d-40b6-9e13-ed11ac16cbb2.index new file mode 100644 index 0000000000000000000000000000000000000000..b4c3ecbb0d6706b91f9def52444f98896e5d5384 GIT binary patch literal 176 zcmZPxaARO#Vq{_o1k=ncL10>lB^XRIu!MkVE|yR*&B_u6rUh8S!8AWh1eoSyi3HQa vEKy*ZnR literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-476c39a6-b760-45db-9602-a1e6559cfabe.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-476c39a6-b760-45db-9602-a1e6559cfabe.index new file mode 100644 index 0000000000000000000000000000000000000000..31b38f52329140bb8268cdadd9cb3c5378fb2d54 GIT binary patch literal 180 zcmd-`PXhrf2PQ@Vj#)E7zy`=><~Vl-1nhupCJqJ=Xm9|s8K7z$fow(&6JsFQ>;z=9 zahwIKcLuWAIlyKW>cq5}IQb->bSL>81Uh$hYmrV1polt Cu_a9a literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4c089e9d-f607-41dc-8edc-41e10a951eac.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4c089e9d-f607-41dc-8edc-41e10a951eac.index new file 
mode 100644 index 0000000000000000000000000000000000000000..3bc7e15b40becdedd87b336711e479223d022bce GIT binary patch literal 89 zcmZQ%PNCCPsER r$C8PWmzMzqe%_C*MiOSog$Ubzbmj-kqY*_=z7U!?BarVI;uHV?B3~au literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77eb6c57-c860-47d1-83a9-ec93886e76c9.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77eb6c57-c860-47d1-83a9-ec93886e76c9.index new file mode 100644 index 0000000000000000000000000000000000000000..ca42db01f8fa14d7b544edd8fcfc5efe05bc4c71 GIT binary patch literal 176 zcmZQ!00JgPK^A5(&C9|Dra4(S!88vG518g>;Rn+!EJ9$Ki$xSnbF)Z*X$}@?FfGI) v2d0Ht6u~qr2asZ5xToA^4iRBUg@~}Oy|Nc9g+^pS`MhZ2j6l9;h*JOn`zQzO literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a7e9396-772a-40c2-811c-ddc1f716e1d9.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a7e9396-772a-40c2-811c-ddc1f716e1d9.index new file mode 100644 index 0000000000000000000000000000000000000000..e8ff3b77ccbb320c41aac18218f450355bdb42f5 GIT binary patch literal 189 zcmd-qE-8vPFfcGQVqz5FgmH|S7@6T56DCF`ILDNUkpa#zV`5~4bIh3-+29-tCPsER r$C8PWkCy=in1lq*BMCF)LWFHUI`f0&(TE}_UkFW{5yWD9ec7=wTXkj=^iHnYQm`4n7~nPDm;FIZTz aH}E&qxhT{uC{GAQl!1W}$oC9!3IG6CL@OHr literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b658fde-dc52-4948-8733-7a6b9e2aa58e.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b658fde-dc52-4948-8733-7a6b9e2aa58e.index new file mode 100644 index 0000000000000000000000000000000000000000..d38a31e038d379f655947c7d46e2c2cbdb3414dc GIT binary patch literal 180 
zcmd;hFzxC2Zbk+MCPo2{6SH7!W{wT>U~DFi8H-?S29AbhFg7Dc#wr+_jU!+kjLptr ru?fcJ<6!`SJE}G12vLSqh-lp>!y>RW8j%I%3!;ewUCH1X;uHV?3N0mV literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b964514-2199-42d4-b2c2-625b461f82f3.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b964514-2199-42d4-b2c2-625b461f82f3.index new file mode 100644 index 0000000000000000000000000000000000000000..e587bf07d64c55e487e005a45de7925f5a745e73 GIT binary patch literal 242 zcmd-qE-8vP00AaOCQcZ~kcp8Q&M{(Q6oPY%nHU-191|u+E;z@OiIEk~F=JvBfOE{5 z82RBG3noTBILDHSQ5eQC1iFA5&H=iBgO>pWmPmfqK?pN5EM(*Zi)R@h=La*;h-FZ| OFq$|cknb7d6aWBeP$|U# literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8dbf5e09-7023-4f97-8bbb-248d3eec6bed.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8dbf5e09-7023-4f97-8bbb-248d3eec6bed.index new file mode 100644 index 0000000000000000000000000000000000000000..450f92ff471e986a1ca7c133933445a8b2fcc876 GIT binary patch literal 105 zcmZQ%;O1iB=HcaIV`dRzWn^S#;$ma~0zM#S0%AsnV$~c2Mn)zEMOKhF1Am`Y5SW2R Qs6qK$XyPE#JVTrU0NVltcK`qY literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-97cac122-2d88-4b84-88aa-efe022506163.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-97cac122-2d88-4b84-88aa-efe022506163.index new file mode 100644 index 0000000000000000000000000000000000000000..242c6e094c75226e353105600251d7d44ca44d1b GIT binary patch literal 144 zcmZRHabsX$ViaJB1=GwdabTK>B_2#Ouq1$KMwUb{&Bl@hrrBAN!89KS0|?l7njD6R dFt|cQG8a870860}UQj+Snm8kn?-}A0006s+3;qBA literal 0 HcmV?d00001 diff --git 
a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-9f0d6ea2-e3d9-4877-9c57-85c285a2cd9d.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-9f0d6ea2-e3d9-4877-9c57-85c285a2cd9d.index new file mode 100644 index 0000000000000000000000000000000000000000..e8721269368dee98316f0a5fb4cb966ee8bfddf4 GIT binary patch literal 144 zcmZQkVPIfjViaIe2h+?fT40)qMHfsnuo!@8Miyf*&BkH|rrB96!89KSkYZqXbdkjk eBEsMb5z%{?I|nR)Qvd)6NeB)A literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ae2b3e23-a92a-4688-b6f9-b35117567156.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ae2b3e23-a92a-4688-b6f9-b35117567156.index new file mode 100644 index 0000000000000000000000000000000000000000..88fe380f736a8ed720fa21ff10eafba0850ab38e GIT binary patch literal 105 zcmZQ%;N}u$W?^OHVBqB!WMpJ!5@2Ki0zM#S0%AsnT{50a7#W!u6j?#y4E%joL0|?N Rp$6r1p^1Y`^9*qc000FL1rPuL literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-af76d02b-7f3a-4f6b-b45b-db44c5ff24cc.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-af76d02b-7f3a-4f6b-b45b-db44c5ff24cc.index new file mode 100644 index 0000000000000000000000000000000000000000..129384f37391a47edb8827062c49b64e9e1b788b GIT binary patch literal 180 zcmd;hFzxC2Zbk+MCPp5P6SH7!L5>acU~EQ?8H-?SHjaj6Fg7Pg#wr+_g(F}cjLptr ru?fcJ=v(6TKyZ literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-bd1a00c8-ff45-426d-af34-f7080ed247ba.index 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-bd1a00c8-ff45-426d-af34-f7080ed247ba.index new file mode 100644 index 0000000000000000000000000000000000000000..88583385377794d3f15ff900bf29165d710dbd2a GIT binary patch literal 89 zcmZQ%AX%8&{XRo}RY11yb3WI_3YXyS}OzGsM2004D34z&OP literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c146d9cb-5e43-4bf8-891b-a05f92cd0563.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c146d9cb-5e43-4bf8-891b-a05f92cd0563.index new file mode 100644 index 0000000000000000000000000000000000000000..92fb4eff3bea37cb04f86dc070fbe2de3c5d29d6 GIT binary patch literal 229 zcmd;J$as3bn~{NmiBXW_MK+Ah%W)we#^&VMQ4C}Aa4aZ?vH3YVs$pyvj)Hm^n~Ni& z8OG-3aA=3IIXE=BVQe7|fqocUnB&JJ7@L)c0R%2OIEceVnHi=s@`8m;wKFDx8EC{T QC|?Lo9Ox|u&k&~o0OG7L-~a#s literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d8503faa-e1d3-474e-a1f1-2e380dec91d3.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d8503faa-e1d3-474e-a1f1-2e380dec91d3.index new file mode 100644 index 0000000000000000000000000000000000000000..4eb313ad7a1838de6326615ae32eb8b1cd80d544 GIT binary patch literal 180 zcmd;Jm@UQt0!)lN91gV-P_`gPL%bA}&B$@UQ3lFp<6uycgR(g}9M}|~Y!;4&7m83e tJI8@j%1|~h514T(c25FAlpz%&s=jd(2Ur@7$b#|((Zm^ne9sW4007TS56l1n literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d99a76c1-c181-49d8-8882-812700de4013.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d99a76c1-c181-49d8-8882-812700de4013.index new file mode 100644 index 
0000000000000000000000000000000000000000..3b2c1ea224eabcccf9f8a2f324cbc9bd7932dc19 GIT binary patch literal 242 zcmd-qE-8vP00AaOK~5OQkcp8O&M{(Qi^CBh!AFGSjfl+7SA$1&JSjw5zC-_ OVKi|@Am1~@DF6W1%PRx` literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-da0c9c05-4f11-4e9a-8246-01582ee5e614.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-da0c9c05-4f11-4e9a-8246-01582ee5e614.index new file mode 100644 index 0000000000000000000000000000000000000000..d50b07abcaa5b0834c1a62d624c719bb620c8fb3 GIT binary patch literal 144 zcmZQkVPIfjV&q{_2h)NqT40)yMHfu7u^519P8MS@&B9^^rrB96!89)ikYZr?koP$f eBEsMb5z%{?I|nR)Qvd)#;|R|H literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e81874df-120e-4124-9517-b8b7fbefc336.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e81874df-120e-4124-9517-b8b7fbefc336.index new file mode 100644 index 0000000000000000000000000000000000000000..d38a31e038d379f655947c7d46e2c2cbdb3414dc GIT binary patch literal 180 zcmd;hFzxC2Zbk+MCPo2{6SH7!W{wT>U~DFi8H-?S29AbhFg7Dc#wr+_jU!+kjLptr ru?fcJ<6!`SJE}G12vLSqh-lp>!y>RW8j%I%3!;ewUCH1X;uHV?3N0mV literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f35fe6ce-b072-4fd6-a91c-a5d940ae5998.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f35fe6ce-b072-4fd6-a91c-a5d940ae5998.index new file mode 100644 index 0000000000000000000000000000000000000000..a9cc153e6f996be4661b92fd6b1ae4b735a8593b GIT binary patch literal 229 zcmd;JfC45)CJu*3j9?xUBQr;z=9 zaGV9JcLuWAIlyKdBA#KT++Xd5M@Y(hzjhJ)B#JQ5m`{aAeuNMknb7d6aWA& CuqI3Z literal 0 HcmV?d00001 
diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f67895be-6638-449c-9a1b-f2ac7d0efc7d.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f67895be-6638-449c-9a1b-f2ac7d0efc7d.index new file mode 100644 index 0000000000000000000000000000000000000000..21c0c6bb5eda914582cbe0c1801c7463468eea37 GIT binary patch literal 176 zcmZQ!00JgPCKhHe&CJ3EriEBI!88L4518g+;Rn;KEJ9#ffJGEc^Rq~RX+9QdFfGg? v2d24M6u~qH2asZ5(3yEa0wThY3K3yldu1@ODrqO*DFrWNXij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j z%u7cx#?8~mHQvwBmk3qCo>54ua`Mqk3i1z+clHnULvj;_8W&d|SCGeofkEUP;vWxdL7&!|dRJtPP1&kyENyl!Uu0AgDo_I11Fl>Sx<6-KEPf<$XD8(0M5EuBmhB(5# z1CK{YzQoXtB^qOEMZiTza(+r`vQSlVNn&0~Vo?f1j6=)5&%u{oo4wOs|HVR<$KjBW z##a5m3?4^S7;c4o@aZRnw=oN4&wp_6)`5wRT$vHmm3RD3&-F5Tx^l;iR}QCDmiidI z40MvO+_`WU$3$J`Hw;%9&NFCA2Dw$B$x^3LH5k9Yj_rv`k<;;mGHpAk(7o zU`5~|aSnc=UsDc$U6+1A!hhyGU+1!I(Hh_9u5Hm5kb85usDoeTOe_nh*+ac17I6iB zCl8iGf*szsyz0w#{9Mv%{$tChsR4VZrg~=Wm{T_A)IModYp2A;%sT}b9gfS&rGBfp zm9#Wca-Em4>&ma1c^9UzbNdP|ZkQKaax`PI0)vPD z|6B5`c(~{OE14(T+gLRpmtNxX%E_3TImO74+lirMMbPo>OQO=(`mbI1HFE~nZ1Yd{ z&%zftKUEN`-yR$4{nX@YYg@(Mxgorp{kE=ObnN20{IkYOXO`ye&Oa5{8t!#dq3H;R z69?lFM}a#5uVrF?z23U!{gn;gx02NV`|h5y_ozlri2{R_#lk5&g)-%m&2wH@tlbkh zZP&HAhT&a2#j3SaCw;gQ5b8VK%X;s&>?vU@UF8g>8E2JmHn}(>Qg@}lam1vRkDgbq zY?R8C5$9Jd>5|MhJF4-vdsAs(`o8l{Y}^toGg?k$tbXJBEcaTBG4I#K3)j7MetJNE z*_HU+9B+4Od@jm-IyHBr_hP$G6&qfBwaz;IbLrQjt#Q+*aaBd1nmX+qJ9D3e_Sfpj zmE!k*ghj6Q32vTXDZi96^}vP(Wi})8UH5iBO*=Q0_2#O|47)<9b%)E&EMMxLJhwu$ z_}@XN;3E%KH1@nK|99lnw09cbdrb5zzG+XMXT`#_;P8Uru*JWEul<-Z%WnD1FUfbZ z7Vg;>lrqEpuk%UviHXY5dOpFgE`0J+I(iCh%*D?C%Z)1~=cgop$;`HsOpes}yNX># zds>Ey@|d1>69INGCcD$>v9!+xEp4zV6@F+5|r$mYO$U% zpqYg=_qLEiV1t(X90tKA9zz$09)}hoj|>gBDE5$3ClAPd);$#|aO^!;fg8E+ecV}HKXKBN=020+mlbb=&DfeI%|9`PEB$cGU%i&8+M-U3=w%iF D#pJ)2 literal 0 HcmV?d00001 diff --git 
a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-804b07d3-0855-4246-b778-8bb12a5154dc-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-804b07d3-0855-4246-b778-8bb12a5154dc-0 new file mode 100644 index 0000000000000000000000000000000000000000..81af93e827e995c343b8812661a97a9a337ebd8c GIT binary patch literal 2162 zcmeZI%3@>@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8)Ecxer2lk%V=Xwt z?a`)96-<518e8@MGI)GtWl%b$tX1GS+i>MS4o$(d6CDk|N|}1D-SVrhp3N-2Jjfy1 zSy|b2cej+THlJ+E67@If=NGM-)KHP|ztARm!!uJ2B@Iu{#>Vcb?B024dw1_@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt89xd3tPT0VCQcm#O z4PhIcT$uhaX>8U1%iyt=mElpRl2*g^Odhqh&NdH3_0KV+q~vt)c@?TC-ah7XCFaY< zPxo`z&R<+$U-;gBXYl;`I=9fH3mPWWKU=OOzque{!IVoKDoN$$UHNb46!SUE;;4Hb zUMFkgz>@RfNaWA(!8O?&z#zf6DOQv30b2&U1P>1zo3?=h(++eG F0RXC~#-acK literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0 new file mode 100644 index 
0000000000000000000000000000000000000000..0214cefe19ee06a3d1d77eece25ceb7af8ca0ebf GIT binary patch literal 1006 zcmbVLO;3X`7@l!w55|~yZ9FbqHQW$tBo;6O=Ep*s4!RA)$5J|zx$qA>=*=JHFYzCE zlj0mW7M;6M`h4_x-qsr2@3are152gMAJdq;@67?9O(>87qkhT)&;bdF8|!RBeDO17 zBZ(B>V=7HZ7^jI;ij8PO8R1mWharuE49NZYw-WBbuB}^UvAOUH6AtoD1vu9c#HM1v z3X*XaCZi_G*6(VlC|$vru7MRzgBli^cBQu#sX+vt4X{^A*U_CK0=6B+sZ{?c5920r zO+AKg;+}%s`f&#{PO~5B4drG7l<6XbRm(MNr|g4d8bzS!dr?1OjLeE^LQZ2bsl`4h zJMwa`0|GiE(})9EFmo@j56f0BprK%0`=wkF>C9Cxp|b=nUBvJk7PZSDf^N+%d=czH zw4Lp?)WWrEUDqpFrv2)4wb7Nh|08?(LHX+ZG@$UsbBN$r(@TsdvI6^7Y0!6q7N7 z9mtuNa0+WMCehk=IwoH6GvT8;$=^>{eN2KViR&}55sMiof)#W>U?ELm=ji;m5+0zw zt@o^aaqbnw0%o-(;6OJJwp0@>A*tX&JX%ND`tuShOjaqmfbx>riyj#FXa}I z_TBDdDe0n?K$9B)zX}f4Khx9A*+ws?`?}i36XdLZ;1D zv+=<9$HkR&O8O82gTkJIs+mk+`9 GRjZ#;Y-BwE literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0 new file mode 100644 index 0000000000000000000000000000000000000000..af2329058bee3fa001c3853687908db301a7833b GIT binary patch literal 1151 zcmeZI%3@>@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG zh6Sv=8w8k+xMvHEID*BHtHMZJz#r`c{?|A*Zc*e2knlQ&TuJ#lK1}1g}2SJDX%vDx1X5Z#{&c2N~0Np_V Dwf24E literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1 new file mode 100644 index 0000000000000000000000000000000000000000..56512396f8b7d7f3301dd3cd68faf746905e277f GIT binary patch literal 1006 zcmbVL%}&BF9GAqi2V;DIp2;47+hip!?!(Xxjbv%cR$ycwq$?66JNPudgim0icW<11 z3R|3PGNQzi-(#vwNf@VzvKAZBgfhaZqz^+H1sQ1dm)}ab2fIkOEwNd6g$W0FvjlW>17b@x zVF}4N3zJbDh4i}$DoVF7mTO{F)1Zc>CMxxIA~k40XAkVP+I4hCM8L>Vol^CW@-VIw zchqC*7VfErTRrYz!D;p*y{X>rfwEk~z-`;LDyQs&WEMq0^j*|X7$XZ&O~`31Cspi& zbw^(AH9fgM7oO^qeV+n)FcHl730ND7nG06@zBZ?{tSPB z3rO%A#sMUdICJBMI3tmi(uQ_|(NjMBeE4}@yMOScQoTx^@}!=6hb$srFP%U!oiNyi zoOubSum)oit$n8x;uSv=KCYAe{gBlsB#4r@J`)?Wm~kRlLH7d|(iC?0Pkt-mAsX0v z-^v%~UO_BiR$Bt@>ITA=YN9116+DQ?>nK})SVD!#3dV9xtZEw4aG_~0_0B|U$Ux3H zu=mt~qdR#7Y&)v6RQ*SJ7_Sr0sK?YTJX8(0yxoJTVCf&}P4&S!P?l>LxYu{B(k}O5 zd=!Q-?|WW9;haqKY68KcVp8*UIO}MXjcSlGpB#k(b_-@U%F4sCd%MU~v9A54+#=GB z+uK9V0<>@`hQDS}It)e7jk&q6friM~%yv_1?mFn-FITci`>WH{MpxqgA9<8rncX@s zq0GDdEEOcCgi~<*_WkLX=C$wNs;8eme|+&;dRf_eb@K<<%_N{ytB)cQ2R=)MOk3?% zpBcyVF)0tSR66a9sjh0DuaRBq@JNH!grT?3feV@&)pQrS;4|pfUM&_6YLX zl)^4#)K6IeYcL`4#(Op;ezBRdNu7xG0j*C-7^g{nE;gYFWrWj$J`QOVWU#w`^jirJ z(ZJUGmN;B^g$ajwZ3Vcc8wgvfiB^z|voM)#qHO&^2^FPl7|S)Ws%c2Wg{HmII~J)S z134$a-cbjR?uZE3c2sAj`j7H3-XtDVkEvUDs2Xnhy55Y_?2q)OdhY}%%QXz#>$_Iz zmisU{j3OxdUer$*BQsG=$Z1?mYH<$d9gXr)4F+^f4kHe`1v4Mzy_Nu%<%cmz!ZFRb z(rGlkMmxBwG&-%8(%fxcQ@nP&qjWrPXP2~gTFt;$8ec9+pb7w}h)0qnv68BiC4=kl Pr_aEt&u4B_)(w9G9q401 literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-0f1a5888-3dd7-4cc9-95f9-9a6cc88e88cf-0.parquet b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-0f1a5888-3dd7-4cc9-95f9-9a6cc88e88cf-0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2d9ad405e69dca2317753a6b19572a29e5113f65 GIT binary patch literal 3411 
zcmds)e@q)y9KgSKy&k0$RtoQS#nTm~&#Hr6S6aX<^H4OJ3?&W{O;}1-S|BSeww)|4 z&Z-H+u`!%ZC!!%}rcN@=h6EO3s^WeSr~I*CGJk}biTq)Vf0(!!HNLx2+R`#f9Q@%+ z^X|QO?|pxK?tS0;wyye7D`Joa6>UQ_0t0~MqfLLAm|{>00%SlX8<7R+5sO%XXXN2n z&4DGUM~0P^*$)BGa)+WUfF~#c(MCJ4dLvJFgprQ3aD5avLgB4gb3JLsjl?QIG>}S9 z9w0XsOo5mw6vb0Oj8LBG>bv&m;4>-#fzix@kN{_}^nSsH z@Av(k*SJSd!KW}pB$Xj48DAAKG=VSCi`&0`JdRJDW$LazP-S`V^Zvc!R5k#fVG}U3 zWOv&9F`{Q_Zji=1 z9V`5)F*Hf#jemwZzYR_q&kbDB{@xgTwm`VTQZlk7j=CR^9jzm^Y(=8C5=YBAzCsF42+E(cUPrP$%m+oi) zpH{bwA5mZ3apJ|h_PpV9P-%!XilDg~iPyw5L4>%GVU)a8LhE{aqj#Nvvbu-+@DtsvyPv;1H2KmcqT@K>N5{QRrs8Jmo`thqC3LtX)!#B_OckjJ+r1eDI zNooIdpyQdxo*xU%@2WgC*g5+Ov|4)fhhuXMj0#jbF}k$+z?1=2<*C>r0;@IFlFd#Z zT?wrD0s%1#uySQs;kI^9DC}zQP&OIi)vLlM4fwL2@Jd&)jN<<7!)IzsBgy%?XUkb=2%}x3#-`aeyN#JRsnqA_6d7!7^$yZn|y; zjivx>Vt_Ukze7q@&D}mE@tAKRL2N)65=5JBh46C`H zY5a!?zr8_NR!1fh#$h|~Y{=c}X?Gz5Nhi#-oQ5>W*#v1O=L8KT)sSDuFV+So)?d7T z%sM+O_=qb=OQt2Au?-rMDv+Vnlf-PWYlVXbK_DyST!KtU4;?Z|CmhUAo|L>X%vz*F zB`8m5l%KK~U0ijHjv4HFWie+l3dVsOnQr~J+-VhZ6}$>r0bY}96R*{MgV*w)#%pt& z;Z-;oxCw!?U=26R6_R87jt*DwiQS%Xsk@`2)D;W{I!#Y{f}ys6-(;;U6U)TX?GIPi XZGZ5QdgB0z=H-46Aem$x`8)LwyywI0 literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-38231641-4057-4e0d-a2d4-91a7842d2acb-0.parquet b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-38231641-4057-4e0d-a2d4-91a7842d2acb-0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..003f1d6f5a578d187d71530cbbd3c2708ec5b204 GIT binary patch literal 3412 zcmds)Urbw77{I^#r=>t)ZMSp1#fuiV9_t)%?+UD#WnLUkO*Tgaq7PH(N*OqyUE8_E zY1U;C(J|I6Y?+9LEIK!XiVP8z#A+sCi%wz)Gr9*ifLLmQ ziM$d`ftV=_BQijY(B7%)Tc6Y6&p8Q!)vgDtnCbB~)ol0qn!N57&5SCc9*?u@#o1!4 zWP_%cDCAKOgBmsy=V!Q7Jj=kzbB65N^FhO{hsQdL>|4&S)79L+|FNx47v8woQZW41 z-kX13S$WgK0Oue;Q2|mD@s$!oMV>sU<$QcJZU0@LP2~7vTnx>2@h}7IqnYN&rBGX&VW9cvL0esY z{m9UYOWnH;RJN2}%T9Cs9$9%g*n0Q2z$*`SxW39|D6%9+J)5Yb734{+h>sTH=;@5O 
zqXz0|ojc^!el+4~FN-_U@njnDG%hts$DE!{QL*8SOt>I6CC~vHh&L`q$_C6yvAk zkz+5tm*X72e5T-o_Ai{v`u6INr;ssd`mU9$qJf41X<=gh9JAyb?L^%!Bqp;ZlreR_ zGBi@Y-P@aDtlD^?m&0FY;jv6foJ0ORTIXXoki_mV3+3EU@VD;|k(AK)HZ)w{+31_e5z-U>Buj~UH$5-&m8pcAMG0RIS$3vT6hjPqa00sd}L!%VY&Kan+U9i zSdW^WI=c{96U!yUPr=HfutJT^-eAbx?AKz8@ao`jw*`-y2(MHX^Ch`0Yol3X`pCzDjqkc`oWyJj zn`?QI5w56~)cB*)Xq!2#G@ThzXUUR(J**SCah}?BROj_HHoKdqNsf@{lW-)5kW5uD zkJ>q-uZ>4Tvq;wRaEygcmfRu`3EGx=7xma;2C32A>uPv2b>q-PwyidhT7gW>&M9_FafZ}xktDK%787Ja>LDVF zj8eNfaZzGq*fWq3twPz-Hu|PwjMJ)PbnH-U(l!eYt7PrIll9hrE1njiSiy@>7T_hh zK=D%DJa{QnHeQ;1h8H0*2*Z-#zy{%}u!7|62mJ29uHD{HzQ^y+cLxH#J+>FTfncMr j#iqClS-Z(;p6>%bQrB0I|5N_}k;}z= literal 0 HcmV?d00001 diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 new file mode 100644 index 000000000..b011496f9 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 @@ -0,0 +1,50 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "pt", + "type" : "INT" + }, { + "id" : 1, + "name" : "col_boolean", + "type" : "BOOLEAN" + }, { + "id" : 2, + "name" : "col_int", + "type" : "INT" + }, { + "id" : 3, + "name" : "col_date", + "type" : "DATE" + }, { + "id" : 4, + "name" : "col_double", + "type" : "DOUBLE" + }, { + "id" : 5, + "name" : "col_timestamp", + "type" : "TIMESTAMP(3)" + }, { + "id" : 6, + "name" : "col_timestamp_ltz", + "type" : "TIMESTAMP(3) WITH LOCAL TIME ZONE" + }, { + "id" : 7, + "name" : "col_decimal", + "type" : "DECIMAL(18, 6)" + }, { + "id" : 8, + "name" : "col_string", + "type" : "STRING" + } ], + "highestFieldId" : 8, + "partitionKeys" : [ "pt" ], + "primaryKeys" : [ ], + "options" : { + "btree-index.records-per-range" : "10", + "data-evolution.enabled" : "true", + "row-tracking.enabled" : "true" + }, + "timeMillis" : 1777884796113 +} \ No newline 
at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST new file mode 100644 index 000000000..56a6051ca --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST new file mode 100644 index 000000000..e440e5c84 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST @@ -0,0 +1 @@ +3 \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 new file mode 100644 index 000000000..d49f49ac9 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-1", + "deltaManifestListSize" : 1113, + "commitUser" : "1d8c13bf-d126-47d8-a0dc-35cf4e1ffb42", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777884797725, + "totalRecordCount" : 20, + "deltaRecordCount" : 20, + "nextRowId" : 20 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 new file mode 100644 index 000000000..13881bf3f --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 2, + "schemaId" : 0, + "baseManifestList" : "manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-0", + "baseManifestListSize" : 1113, + "deltaManifestList" : "manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-1", + "deltaManifestListSize" : 1119, + "commitUser" : "1d8c13bf-d126-47d8-a0dc-35cf4e1ffb42", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777884797831, + "totalRecordCount" : 40, + "deltaRecordCount" : 20, + "nextRowId" : 40 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 new file mode 100644 index 000000000..af2c32912 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 @@ -0,0 +1,17 @@ +{ + "version" : 3, + "id" : 3, + "schemaId" : 0, + "baseManifestList" : "manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0", + "baseManifestListSize" : 1151, + "deltaManifestList" : "manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1", + "deltaManifestListSize" : 1006, + "indexManifest" : "index-manifest-efa3dce8-fe86-499a-b436-182337edd173-0", + "commitUser" : "cc3110f6-7052-4e18-8b4d-99e6c133a02f", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777884798405, + "totalRecordCount" : 40, + "deltaRecordCount" : 0, + "nextRowId" : 40 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/README 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/README new file mode 100644 index 000000000..7ab7a6619 --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/README @@ -0,0 +1,27 @@ +pt:int +col_boolean:bool +col_int:int +col_date:int +col_double:double +col_timestamp:timestamp +col_timestamp_ltz:timestamp with local timezone +col_decimal:decimal +col_string:string + +pt:partition key +no bucket key +bucket count: -1 +global btree index: col_boolean/col_int/col_date/col_double/col_timestamp/col_timestamp_ltz/col_decimal/col_string + +Msgs: +snapshot-1 +Add:(0, i%2, i * 3, 18000 + i, i * 2.2, 1700000000000L + i * 1000L, 1700000000000L + i * 1000L, Decimal(i * 123456L, 18, 6), "str_0000i") +i = [0, 19] +NoCompact + +snapshot-2 +Add:(1, i%2, i * 3, 18000 + i, i * 2.2, 1700000000000L + i * 1000L, 1700000000000L + i * 1000L, Decimal(i * 123456L, 18, 6), "str_0000i") +i = [0, 19] +NoCompact + +snapshot-3: with global btree index for col_boolean/col_int/col_date/col_double/col_timestamp/col_timestamp_ltz/col_decimal/col_string \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03d348f1-3a2b-47b2-ab26-1d894a5bc150.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03d348f1-3a2b-47b2-ab26-1d894a5bc150.index new file mode 100644 index 0000000000000000000000000000000000000000..129384f37391a47edb8827062c49b64e9e1b788b GIT binary patch literal 180 zcmd;hFzxC2Zbk+MCPp5P6SH7!L5>acU~EQ?8H-?SHjaj6Fg7Pg#wr+_g(F}cjLptr ru?fcJ=v(6TKyZ literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03da8b2e-7eb5-42c7-9741-6cfa57e3e305.index 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03da8b2e-7eb5-42c7-9741-6cfa57e3e305.index new file mode 100644 index 0000000000000000000000000000000000000000..ca42db01f8fa14d7b544edd8fcfc5efe05bc4c71 GIT binary patch literal 176 zcmZQ!00JgPK^A5(&C9|Dra4(S!88vG518g>;Rn+!EJ9$Ki$xSnbF)Z*X$}@?FfGI) v2d0Ht6u~qr2asZ5xToA^4iRBUg@~}Oy|Nc9g+^pS`MhZ2j6l9;h*JOn`zQzO literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0b4db5e0-03a9-4fa4-ba96-0288a0eb0b19.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0b4db5e0-03a9-4fa4-ba96-0288a0eb0b19.index new file mode 100644 index 0000000000000000000000000000000000000000..5887cd946cc9891c6adbde4dfab98e5589dcf816 GIT binary patch literal 180 zcmd-`PXhrf2PQ@yj#)E7zy`<`;z=9 zaGV9JcLuWAIlyKdBA#KT++Xd5M@Y(hzjhJ)B#JQ5m`{aAeuNMknb7d6aWA& CuqI3Z literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-10d1c370-e5ad-4530-9935-84e018b7cf8f.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-10d1c370-e5ad-4530-9935-84e018b7cf8f.index new file mode 100644 index 0000000000000000000000000000000000000000..3e97caec2f2d4b2d9ec61e7910a7e233358a576a GIT binary patch literal 229 zcmd;JfC45)L5^87L4eVLiIEq|76h_6InvWWKnlp_fr={v+58+3t!h9v3sj93kj=$$ z?hFX%0omM8^@cz;2gg~kxG|6|1l4N>WD9ec7=wTXkj=^iHnYQm`4n7~nPDm;FIZTz aH}E&qxhT{uC{GAQl!1W}$oC9!3IG6CL@OHr literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-226a2318-32a7-4569-88b1-096c4029e4de.index 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-226a2318-32a7-4569-88b1-096c4029e4de.index new file mode 100644 index 0000000000000000000000000000000000000000..450f92ff471e986a1ca7c133933445a8b2fcc876 GIT binary patch literal 105 zcmZQ%;O1iB=HcaIV`dRzWn^S#;$ma~0zM#S0%AsnV$~c2Mn)zEMOKhF1Am`Y5SW2R Qs6qK$XyPE#JVTrU0NVltcK`qY literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-336a671f-39fd-4922-b322-ec854712e573.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-336a671f-39fd-4922-b322-ec854712e573.index new file mode 100644 index 0000000000000000000000000000000000000000..129384f37391a47edb8827062c49b64e9e1b788b GIT binary patch literal 180 zcmd;hFzxC2Zbk+MCPp5P6SH7!L5>acU~EQ?8H-?SHjaj6Fg7Pg#wr+_g(F}cjLptr ru?fcJ=v(6TKyZ literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-34391e56-cb42-4e31-9002-8be9162b9d77.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-34391e56-cb42-4e31-9002-8be9162b9d77.index new file mode 100644 index 0000000000000000000000000000000000000000..e8721269368dee98316f0a5fb4cb966ee8bfddf4 GIT binary patch literal 144 zcmZQkVPIfjViaIe2h+?fT40)qMHfsnuo!@8Miyf*&BkH|rrB96!89KSkYZqXbdkjk eBEsMb5z%{?I|nR)Qvd)6NeB)A literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-36d11008-5e75-4801-82da-f5e7c7bc62e0.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-36d11008-5e75-4801-82da-f5e7c7bc62e0.index new file mode 100644 index 
0000000000000000000000000000000000000000..88fe380f736a8ed720fa21ff10eafba0850ab38e GIT binary patch literal 105 zcmZQ%;N}u$W?^OHVBqB!WMpJ!5@2Ki0zM#S0%AsnT{50a7#W!u6j?#y4E%joL0|?N Rp$6r1p^1Y`^9*qc000FL1rPuL literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5279021b-ac4b-4418-9e56-5e8b3ede837b.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5279021b-ac4b-4418-9e56-5e8b3ede837b.index new file mode 100644 index 0000000000000000000000000000000000000000..88583385377794d3f15ff900bf29165d710dbd2a GIT binary patch literal 89 zcmZQ%AX%8&{XRo}RY11yb3WI_3YXyS}OzGsM2004D34z&OP literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-66e10354-4845-43e2-9838-3ceb058fe2c7.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-66e10354-4845-43e2-9838-3ceb058fe2c7.index new file mode 100644 index 0000000000000000000000000000000000000000..b4c3ecbb0d6706b91f9def52444f98896e5d5384 GIT binary patch literal 176 zcmZPxaARO#Vq{_o1k=ncL10>lB^XRIu!MkVE|yR*&B_u6rUh8S!8AWh1eoSyi3HQa vEKy*Zn<~Vl-1nhupCJqJ=Xm9|s8K7z$fow(&6JsFQ>;z=9 zahwIKcLuWAIlyKW>cq5}IQb->bSL>81Uh$hYmrV1polt Cu_a9a literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6fc58c7a-d306-4da1-a33e-22dfee9390e9.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6fc58c7a-d306-4da1-a33e-22dfee9390e9.index new file mode 100644 index 0000000000000000000000000000000000000000..a9cc153e6f996be4661b92fd6b1ae4b735a8593b GIT binary patch literal 229 zcmd;JfC45)CJu*3j9?xUBQrPNCCPsER r$C8PWmzMzqe%_C*MiOSog$Ubzbmj-kqY*_=z7U!?BarVI;uHV?B3~au 
literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-86da803a-8554-4201-934c-0a8edf06bb94.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-86da803a-8554-4201-934c-0a8edf06bb94.index new file mode 100644 index 0000000000000000000000000000000000000000..f51554f65f9725066444fc30bc8123d0ad51a2f5 GIT binary patch literal 229 zcmd;JfC45)CXQJ%L4eVLiIEw~76h_|IMUNVKnloafQl;u*<2hDt!h9vD^!gZkS)M* z?hFX%0onXe^@cz;AIDj+xG|6|4ApA}WOH+v7=wTXkj=pZHd9DtDJNW%nPDm;FIZTz aH}E&qxhT{uC{GAQl!1W}$oC9!3IG5Rv?^Wz literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8be14989-0b16-44d6-aedd-0556af02c4b5.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8be14989-0b16-44d6-aedd-0556af02c4b5.index new file mode 100644 index 0000000000000000000000000000000000000000..4eb313ad7a1838de6326615ae32eb8b1cd80d544 GIT binary patch literal 180 zcmd;Jm@UQt0!)lN91gV-P_`gPL%bA}&B$@UQ3lFp<6uycgR(g}9M}|~Y!;4&7m83e tJI8@j%1|~h514T(c25FAlpz%&s=jd(2Ur@7$b#|((Zm^ne9sW4007TS56l1n literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-93f35d27-0323-42d2-9a03-651f4d75b4a6.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-93f35d27-0323-42d2-9a03-651f4d75b4a6.index new file mode 100644 index 0000000000000000000000000000000000000000..242c6e094c75226e353105600251d7d44ca44d1b GIT binary patch literal 144 zcmZRHabsX$ViaJB1=GwdabTK>B_2#Ouq1$KMwUb{&Bl@hrrBAN!89KS0|?l7njD6R dFt|cQG8a870860}UQj+Snm8kn?-}A0006s+3;qBA literal 0 HcmV?d00001 diff --git 
a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ac97eb16-ec2e-4a32-8366-939956cc32df.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ac97eb16-ec2e-4a32-8366-939956cc32df.index new file mode 100644 index 0000000000000000000000000000000000000000..e1f90eab95c9e43607683cb1fff063f11726e6a8 GIT binary patch literal 229 zcmd;J$as3bn~{NmiIIupMK+Ah%yA(f#unn(Q4C`R literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c02c2b4e-e9b1-437e-82b2-9354c4f2ecfb.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c02c2b4e-e9b1-437e-82b2-9354c4f2ecfb.index new file mode 100644 index 0000000000000000000000000000000000000000..e8ff3b77ccbb320c41aac18218f450355bdb42f5 GIT binary patch literal 189 zcmd-qE-8vPFfcGQVqz5FgmH|S7@6T56DCF`ILDNUkpa#zV`5~4bIh3-+29-tCPsER r$C8PWkCy=in1lq*BMCF)LWFHUI`f0&(TE}_UkFW{5yU~DFi8H-?S29AbhFg7Dc#wr+_jU!+kjLptr ru?fcJ<6!`SJE}G12vLSqh-lp>!y>RW8j%I%3!;ewUCH1X;uHV?3N0mV literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-cc3a2bf4-5a19-46d1-8634-e7e1256026e8.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-cc3a2bf4-5a19-46d1-8634-e7e1256026e8.index new file mode 100644 index 0000000000000000000000000000000000000000..d38a31e038d379f655947c7d46e2c2cbdb3414dc GIT binary patch literal 180 zcmd;hFzxC2Zbk+MCPo2{6SH7!W{wT>U~DFi8H-?S29AbhFg7Dc#wr+_jU!+kjLptr ru?fcJ<6!`SJE}G12vLSqh-lp>!y>RW8j%I%3!;ewUCH1X;uHV?3N0mV literal 0 HcmV?d00001 diff --git 
a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d02fec1b-f5eb-4890-a7eb-740cbc385f84.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d02fec1b-f5eb-4890-a7eb-740cbc385f84.index new file mode 100644 index 0000000000000000000000000000000000000000..3b2c1ea224eabcccf9f8a2f324cbc9bd7932dc19 GIT binary patch literal 242 zcmd-qE-8vP00AaOK~5OQkcp8O&M{(Qi^CBh!AFGSjfl+7SA$1&JSjw5zC-_ OVKi|@Am1~@DF6W1%PRx` literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d535d249-542f-4bb1-9a69-dc957176497b.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d535d249-542f-4bb1-9a69-dc957176497b.index new file mode 100644 index 0000000000000000000000000000000000000000..3bc7e15b40becdedd87b336711e479223d022bce GIT binary patch literal 89 zcmZQ%pWmPmfqK?pN5EM(*Zi)R@h=La*;h-FZ| OFq$|cknb7d6aWBeP$|U# literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dd37e0dd-c106-48bd-8ba0-4315b38b2eed.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dd37e0dd-c106-48bd-8ba0-4315b38b2eed.index new file mode 100644 index 0000000000000000000000000000000000000000..21c0c6bb5eda914582cbe0c1801c7463468eea37 GIT binary patch literal 176 zcmZQ!00JgPCKhHe&CJ3EriEBI!88L4518g+;Rn;KEJ9#ffJGEc^Rq~RX+9QdFfGg? 
v2d24M6u~qH2asZ5(3yEa0wThY3K3yldu1A$mWEySvVSexuI+> zjsyC)Qvd)#;|R|H literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ff65668f-e4d2-4e69-ab5a-79a777f51650.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ff65668f-e4d2-4e69-ab5a-79a777f51650.index new file mode 100644 index 0000000000000000000000000000000000000000..e1f90eab95c9e43607683cb1fff063f11726e6a8 GIT binary patch literal 229 zcmd;J$as3bn~{NmiIIupMK+Ah%yA(f#unn(Q4C`R literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-7b331606-88e1-4bdd-8426-8c6b88103b42-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-7b331606-88e1-4bdd-8426-8c6b88103b42-0 new file mode 100644 index 0000000000000000000000000000000000000000..5fecfb9cc4a80e3fea5c2cca962c1dcba53d378b GIT binary patch literal 2405 zcmeZI%3@>@ODrqO*DFrWNXij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j z%u7cx#?8~mHQvwBmk3qCo>54ua`Mqk3i1z+clHnULvj;_8W&d|SCGeofkEUP;vWxdL7&!|dRJtPP1&kyENyl!Uu0AgDo_I11Fl>Sx<6-KEPf<$XD8(0M5EuBmhB(5# z1CK{YzQoXtB^qOEMZiTza(+r`vQSlVNn&0~Vo?f1lVQvf&&aq#8v;7yKdk=~?XXWs zW2^pO29K>O3|$cse1-v`T(SbMFTA+M(roz5hm)~$hTN`$oSUQH)iO_Yf4H?@cl({7 zQ+DiSLj7&;J#wXQGQMQE#&DKl?bc(Bhgz;hWy`Q#ad4gc%KJ=3{FE!ErnX*{`^~ym zS~&@ADRAVFbl^~Ap3p24!SI0fm2>AVY3G{sWz&y*sXjUHT+-%SaUB~Dne2}4{pk4S zb%5-}i{fFQ7fEco^43wHQXt`|C6DHvrl^k}`Szx~4!rtnmY#;G;I!Sh6`GE4IB_&S zRp3yVAiZksv1Q)AixxZ8&A%ev`fDwh_&ueTh=`U63@s5C95^^HG&FPU5NK=`V3E=G zGGTt`=ydC9@|En>`Ey^aS`!y|mw&aK-E-AhjUPVnd$#^K@bIpZhlVhaSi$tfQLbeI zM~iZIyqCAv*(3+Cf|41h&)KPM-Kt-8;k?t{z}kB&EH>JR%~_<}pQ+={yWTEom9mm; zw#NNaH@D2}XRp+FpdiC7XE-ZU*(SF8ovWez+KcmBD)l-u1w}QtUXLi=rSJQtXhnGP z56ynzXt@oa&R8C8E~*usw$AnC<&8TnS!FMKHyRrUvpXpC-nQyGz4eo?(iiW==_kLu 
zf1>qzeP!tGz;yd6`b?B&D>c-CD9&FKYYzyc99{=}C(o zC0nZ653 zpKJEEQ*pn{>$91~`?Oo%F1~XzAg`@>WogyAUrXeZ_GVpPcj{H!abPHJ$P1ZsYRZ-P zpqgb7`@^qE?zTI5gZW@{tCh$1DajX-+-G0t4*J!W)h9C3^Dxhxn@P7U)fawF39-6$ zP5E-&MH?Kv9Pdtf_vPDHsa>lU=WDhIIC(UQwJ=mnY@a)|ok!Q!%Kh(+SK*#7 zE9P$B5P0K-L)?mecWz&q`F6Ize!;KREAK98jfg($#KtYbGN<+KjXAHTK2Irmb45CJ z@2owSc6%T0@?CW%@mAN(=c*+sEBw#=c1+u(s>I%NsrC9Lsp9lCm#43+>)xg878SLz zSzJ$Kg#vru(z8oVLf`LNzAyXhipu(&Nar&)5i2I&Src41dwph!-m36jmqHIZv!89{ z*wr5LJc;-F9v`bqOVT^%=C9M6b55C=jiY7p;V+`8dzP-fzx9&f@`ed37+p`YC>=cF zp1s1Vg4|7MB@t7zKU7wue{_E=TUPA3=Pq5^4hQ&V>3FXC-{&?% zcui97cL{&L9~uJRCvIMvT5+~L@>Bc6KOQSCHAXh-HL_Vg*EZQUonxAJ)+zq;2RAw_ zE>w}4a6x~{@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8rm?a)?z?sH%SVOB z>WMxM7EBYEHMZ*iW$^gR%AmxhqCMegB=hkfH6rabC zcXP9y4Q0X{^zNy6%w)2DU;p30duf7DrTyVQ(l)OYSz@+j&gSE2s(yb|=alZ!DM1r6 z`}Y5uT`ttvcyxmB)pyUz8TLOcuqaXLN|W#HG2$}v`npLx literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-ce023896-9468-49d9-8c55-e26271ff81da-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-ce023896-9468-49d9-8c55-e26271ff81da-0 new file mode 100644 index 0000000000000000000000000000000000000000..f5d02b99e9f864f429afb6f78e29fff0a827f1e1 GIT binary patch literal 2158 zcmeZI%3@>@ODrqO*DFrWNX<>$CtIylQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3(&d-}QHmJM(W3h@j-~-QA$ZoODxSP zQL+N*tc|TjPK;nj!2JU;5^gm1Tmnf6@ep6awGRED0L5IeYhb9WpEEEuhx$6X1|iZI zW-0_GG2&GRJA1hLI>vhnLR!|}ip(=`GfG;mWO86TfZAZ3n!Xpl3a 
z)PZXUg)lzNNO=R83w;p54l@BBX1IbMpXtE(bPe(Y7SjQaAs(21gBGCpOmOoI3Jw8> zGE%yNxs7lThX;8ABZ*2L1C}bWwIbm9F*!daHCd<%m|XHw5{pt8mi2C!*4Ur^b<5NJ zil6cu;+fi*HMZ*iW$?Jl%JAr%PyEe_etLgUC%Qw}_xz_LB=f%q-%E|w0^*$q!U2ZMTS3}OSG$fs~ROMJ#ULKyX zh^Ov(c%7z)4^K})!{$%=9i?Wo&raL!;L6S#tWuEEJ+GT5#mVoK`(zvT=$PYM*nO9s ztd*HEak;D3shx5EKPd?_Sh_g0G0HL|8wfmMp6RHT8NeXHxG7eX?*Ur|y95sp8=JO) K0@Dt3UjYDFv&uFA literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-0 new file mode 100644 index 0000000000000000000000000000000000000000..7eeaeac9c42bb388aa40bc0c1b055631f8c9a991 GIT binary patch literal 1113 zcmbVLJ#W)M7{0h+qywrDD>tAFacI*tP14E4hiYLvD7Hh@5{|2VPL2E#ov)QbYyL|A zLDi)L3j<>1A22fUA1YyoG%-m9Z6<%*^KsAf-gOQ?6pD|@35zPReMAHDYxNZJ(U8Ix zWYmtB1IsWZ!NPYuBzE>QVuK1v-;ZczNW35lD`T+%4Jjj>W^~u1z7xaN!Rd_>?xC)s zbo6v_;^l=LCgmC6rJ^FNOB$L%V$Qs9u!u60{T#}V<}kXYVM&&ej5AGRrgtk+Mk+Gz zfPF1>O~p(jV8fKmnd*Pa!+4Q+OFf#RWSRZ$PA%h@hVOeo?nCAL;$s7@m)Yd&!sUJZMt6F zY&7d)vsQ12wWiY)8}`=HXI_Z|9C2Z5&)n81VKoj!m}0d`TOwG`yKFL G-s(5BwPB+G literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-1 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-1 new file mode 100644 index 0000000000000000000000000000000000000000..8b6b981cb554e79901e9765e74a0279034a03181 GIT binary patch literal 1119 zcmbVLL2uJA7)I2c@wA8}otmT|reZmX+ht0e%JGo8Df|eo z>@V!djnj_&2#y>O`~rl8Q%W1!2}Ttwa_slM=l4FR=bu!n56EktHnQNDCgjWAb10@0 z3OkU~Amt&fLq?Ld?`%SXVln0829fh)+L(|iNm*l7Y)l#DM9_jhif9~Wuyb_&yAmFt zzGL)kxjEO0n1FeG3D`GGgl)}2OGqYo#K!9=$9P#n#pw#h_AIRFI?{1b(^=NL7O5i> 
zxi`QbYJJymr3LJ`n!Bv}kMJ;FCteeeW!QM2nO^z0{#4NH5BHY#;sz+&Gfmv>d3NcP zN06PyF_d}F<0qVxsf;EPG$}eY--ok|hWW4#Lpma-v4EX|nGf@Fvz+b$vNUYye+jp+ z^wjG}EZqLebT!cxyZ=WX=J#iR zU66(}xObKb!a~ABFn;^7JNkOT-hFrWPrj+Ql@FEL`>h|~;8FpiT78ue7L8~oRR3|% z+V*$aYMbsg)n-e+_L|LYmF(=c=q{nY9O|Rb4;4@Y0953oD2kjZHN~!i_SNJKxb^AI Ilgg^#Pfzb-u>b%7 literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-0 new file mode 100644 index 0000000000000000000000000000000000000000..dde84a7750d2d2ade95087dcfb0024c63c46750f GIT binary patch literal 1006 zcmbVL&q~8E999s#c@Xg|InO@8ZgGuRY*Sg=DUK4dHrp!wGtx|$(ueQ`MDVnOPqmwm z;K^jpwXQ?wZcX$3$@l#dyLY#{cTDejrsctqCG`E^5s1l%fhOQA$an}EAf-udosVds zer9~2QTaV&+K5I;mTGgc0ZSRDf+>1GVsV&*X7}-1iF8ogF>EY17hX{+K+z}x=Z1+0 zh88Lzx!_ScsG}U?wt|YY6%6()0(BkfL}@yu-dd!NOyq8Wy@GAmaAgGIxX>+C|0s`; zI&n=smVrqJnqKv|y@_D?kMtJ2*#Lz-(L7DF+ij`RwQXNlD_N%f>U6cymAL;S`^C}x)dg9|g2RVg&@`kx+}WQ_-=1H3C$rNT Id3yc8pF0FaKL7v# literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-1 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-1 new file mode 100644 index 0000000000000000000000000000000000000000..d249e084088b50f2bb92669a41f33486338a31f4 GIT binary patch literal 1113 zcmbVLJ#W)M7{0h+gaM)My#aNI(>6^*5+)NLs$1JZu^lR^a9r(kV&sqLd__tl^HUhu z*g&XK`3VfHojNn{ACR!qkeH-`lF1+UeBATAckPqch2lDS!=iF*4`@Js+#5qa8dBJV zjM@=%U0!FtqEXgvGai(d^^=?GUNJZu? 
zu+OB9shDX5Y?zWcSN%tM7%vlVs7F(D+?7--zul7&r|}=@HRHZx|@OY*}qTKhMi=7#AGw!2nA|!$ za;Sa(EaoJ12y?)vi_dpId_Ne!`#Lz@ODrqO*DFrWNX<=r#aFFVQdy9yWTjM;nw(#hqNJmgmzayeFD^(-1_|aD zrRyaE*%_&N1&Nut`FVO^!_rgpQi~ExQbF3&GE;L>ij}OQt6?U^hq(p?d;0qUC82g@ z=9MVb>L3)jdHT4<`#Ji$B9s)D6lLb6W2y@Fj6zbClaHpxFVr{Q(Z$8pB_3pEFv2{n zDqUQCTtkRZ8{ile;u!+;LVR$DV+g`j0)ZDFkEsz$@c4TA#fLfigd#gPsj?)s7{&L# zjuAwt@(cCxiFfu7^+R%Hv{GJaPL2{VyrIFLSX7i)2@IOdlGI#KOhM(9z+n_y8>^#~ zlA4xSnp2`=1=3j?TZVXi*pSxc@~pshjv;mAP(_bnt$(L)r?yQFE3trY?1y5#(n)MTNm;*!L?l*FPG z2JNGAvf}Nhf=D0P}w_Vy&XY_ri*8JNwes5N^&0u)XC%FF3+(L~K?1rO@!~)BJX+I(MV=`_u3Hwa)mdb$Usj1=lhugS_`pLyq zBgOZOX>$@KJk?6EDNC6kk}3KmVsV&(&S>?mgokk8nvPv;ZoHyYg1lV;dM1L{)-6~; zGAW{T+C;hLV+|Ga9gOW;ST_u4V5R9+dV7%uM9@0`+t&x4=@k*M>*-#l`bT*fH;H@d zu}m8eb>!ENJ6=kb{YY==4+o%ZA0gaze7kna5l9zt42r%t^%Eh;vZyAKEK!r%?1QqS zAP?FgWD~N8CFm$-9_005x!qf6>DV-WDOW|h@w<1>+k&<(YWNL{+GS8dkLDJ>2o51S z&h}Vp;W}{c>y>QNes#Lr=uX`Kk%RoAe07wEjGn(_lB6LK;mNC5FKC~>zwyac>w4Y$ E1T;cJM*si- literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-b6d91b0d-4fc5-410a-9157-2dd1cf34a15e-0.parquet b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-b6d91b0d-4fc5-410a-9157-2dd1cf34a15e-0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2d9ad405e69dca2317753a6b19572a29e5113f65 GIT binary patch literal 3411 zcmds)e@q)y9KgSKy&k0$RtoQS#nTm~&#Hr6S6aX<^H4OJ3?&W{O;}1-S|BSeww)|4 z&Z-H+u`!%ZC!!%}rcN@=h6EO3s^WeSr~I*CGJk}biTq)Vf0(!!HNLx2+R`#f9Q@%+ z^X|QO?|pxK?tS0;wyye7D`Joa6>UQ_0t0~MqfLLAm|{>00%SlX8<7R+5sO%XXXN2n z&4DGUM~0P^*$)BGa)+WUfF~#c(MCJ4dLvJFgprQ3aD5avLgB4gb3JLsjl?QIG>}S9 z9w0XsOo5mw6vb0Oj8LBG>bv&m;4>-#fzix@kN{_}^nSsH z@Av(k*SJSd!KW}pB$Xj48DAAKG=VSCi`&0`JdRJDW$LazP-S`V^Zvc!R5k#fVG}U3 zWOv&9F`{Q_Zji=1 z9V`5)F*Hf#jemwZzYR_q&kbDB{@xgTwm`VTQZlk7j=CR^9jzm^Y(=8C5=YBAzCsF42+E(cUPrP$%m+oi) 
zpH{bwA5mZ3apJ|h_PpV9P-%!XilDg~iPyw5L4>%GVU)a8LhE{aqj#Nvvbu-+@DtsvyPv;1H2KmcqT@K>N5{QRrs8Jmo`thqC3LtX)!#B_OckjJ+r1eDI zNooIdpyQdxo*xU%@2WgC*g5+Ov|4)fhhuXMj0#jbF}k$+z?1=2<*C>r0;@IFlFd#Z zT?wrD0s%1#uySQs;kI^9DC}zQP&OIi)vLlM4fwL2@Jd&)jN<<7!)IzsBgy%?XUkb=2%}x3#-`aeyN#JRsnqA_6d7!7^$yZn|y; zjivx>Vt_Ukze7q@&D}mE@tAKRL2N)65=5JBh46C`H zY5a!?zr8_NR!1fh#$h|~Y{=c}X?Gz5Nhi#-oQ5>W*#v1O=L8KT)sSDuFV+So)?d7T z%sM+O_=qb=OQt2Au?-rMDv+Vnlf-PWYlVXbK_DyST!KtU4;?Z|CmhUAo|L>X%vz*F zB`8m5l%KK~U0ijHjv4HFWie+l3dVsOnQr~J+-VhZ6}$>r0bY}96R*{MgV*w)#%pt& z;Z-;oxCw!?U=26R6_R87jt*DwiQS%Xsk@`2)D;W{I!#Y{f}ys6-(;;U6U)TX?GIPi XZGZ5QdgB0z=H-46Aem$x`8)LwyywI0 literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-6ab3cbef-417f-47b0-83ee-4145bc3dedb7-0.parquet b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-6ab3cbef-417f-47b0-83ee-4145bc3dedb7-0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..003f1d6f5a578d187d71530cbbd3c2708ec5b204 GIT binary patch literal 3412 zcmds)Urbw77{I^#r=>t)ZMSp1#fuiV9_t)%?+UD#WnLUkO*Tgaq7PH(N*OqyUE8_E zY1U;C(J|I6Y?+9LEIK!XiVP8z#A+sCi%wz)Gr9*ifLLmQ ziM$d`ftV=_BQijY(B7%)Tc6Y6&p8Q!)vgDtnCbB~)ol0qn!N57&5SCc9*?u@#o1!4 zWP_%cDCAKOgBmsy=V!Q7Jj=kzbB65N^FhO{hsQdL>|4&S)79L+|FNx47v8woQZW41 z-kX13S$WgK0Oue;Q2|mD@s$!oMV>sU<$QcJZU0@LP2~7vTnx>2@h}7IqnYN&rBGX&VW9cvL0esY z{m9UYOWnH;RJN2}%T9Cs9$9%g*n0Q2z$*`SxW39|D6%9+J)5Yb734{+h>sTH=;@5O zqXz0|ojc^!el+4~FN-_U@njnDG%hts$DE!{QL*8SOt>I6CC~vHh&L`q$_C6yvAk zkz+5tm*X72e5T-o_Ai{v`u6INr;ssd`mU9$qJf41X<=gh9JAyb?L^%!Bqp;ZlreR_ zGBi@Y-P@aDtlD^?m&0FY;jv6foJ0ORTIXXoki_mV3+3EU@VD;|k(AK)HZ)w{+31_e5z-U>Buj~UH$5-&m8pcAMG0RIS$3vT6hjPqa00sd}L!%VY&Kan+U9i zSdW^WI=c{96U!yUPr=HfutJT^-eAbx?AKz8@ao`jw*`-y2(MHX^Ch`0Yol3X`pCzDjqkc`oWyJj zn`?QI5w56~)cB*)Xq!2#G@ThzXUUR(J**SCah}?BROj_HHoKdqNsf@{lW-)5kW5uD zkJ>q-uZ>4Tvq;wRaEygcmfRu`3EGx=7xma;2C32A>uPv2b>q-PwyidhT7gW>&M9_FafZ}xktDK%787Ja>LDVF 
zj8eNfaZzGq*fWq3twPz-Hu|PwjMJ)PbnH-U(l!eYt7PrIll9hrE1njiSiy@>7T_hh zK=D%DJa{QnHeQ;1h8H0*2*Z-#zy{%}u!7|62mJ29uHD{HzQ^y+cLxH#J+>FTfncMr j#iqClS-Z(;p6>%bQrB0I|5N_}k;}z= literal 0 HcmV?d00001 diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 new file mode 100644 index 000000000..1cc1c81de --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 @@ -0,0 +1,50 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "pt", + "type" : "INT" + }, { + "id" : 1, + "name" : "col_boolean", + "type" : "BOOLEAN" + }, { + "id" : 2, + "name" : "col_int", + "type" : "INT" + }, { + "id" : 3, + "name" : "col_date", + "type" : "DATE" + }, { + "id" : 4, + "name" : "col_double", + "type" : "DOUBLE" + }, { + "id" : 5, + "name" : "col_timestamp", + "type" : "TIMESTAMP(3)" + }, { + "id" : 6, + "name" : "col_timestamp_ltz", + "type" : "TIMESTAMP(3) WITH LOCAL TIME ZONE" + }, { + "id" : 7, + "name" : "col_decimal", + "type" : "DECIMAL(18, 6)" + }, { + "id" : 8, + "name" : "col_string", + "type" : "STRING" + } ], + "highestFieldId" : 8, + "partitionKeys" : [ "pt" ], + "primaryKeys" : [ ], + "options" : { + "btree-index.records-per-range" : "10", + "data-evolution.enabled" : "true", + "row-tracking.enabled" : "true" + }, + "timeMillis" : 1777882015920 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST new file mode 100644 index 000000000..56a6051ca --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of 
file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST new file mode 100644 index 000000000..e440e5c84 --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST @@ -0,0 +1 @@ +3 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 new file mode 100644 index 000000000..18b77828f --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-1", + "deltaManifestListSize" : 1113, + "commitUser" : "b8ba31b4-9a2c-49ac-82c2-4255ed3ef903", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777882017549, + "totalRecordCount" : 20, + "deltaRecordCount" : 20, + "nextRowId" : 20 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 new file mode 100644 index 000000000..557ade85d --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 2, + "schemaId" : 0, + "baseManifestList" : "manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-0", + 
"baseManifestListSize" : 1113, + "deltaManifestList" : "manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-1", + "deltaManifestListSize" : 1119, + "commitUser" : "b8ba31b4-9a2c-49ac-82c2-4255ed3ef903", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777882017651, + "totalRecordCount" : 40, + "deltaRecordCount" : 20, + "nextRowId" : 40 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 new file mode 100644 index 000000000..e597168e6 --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 @@ -0,0 +1,17 @@ +{ + "version" : 3, + "id" : 3, + "schemaId" : 0, + "baseManifestList" : "manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-0", + "baseManifestListSize" : 1153, + "deltaManifestList" : "manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-1", + "deltaManifestListSize" : 1006, + "indexManifest" : "index-manifest-7b331606-88e1-4bdd-8426-8c6b88103b42-0", + "commitUser" : "89b4b9f1-f0e5-4b74-b853-324cd44ac62a", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777882018236, + "totalRecordCount" : 40, + "deltaRecordCount" : 0, + "nextRowId" : 40 +} \ No newline at end of file From a02bbef28ccc601b2dba1cb676b51203a214aad6 Mon Sep 17 00:00:00 2001 From: "lisizhuo.lsz" Date: Thu, 7 May 2026 16:49:13 +0800 Subject: [PATCH 2/2] fix review & add executor for GlobalIndexScan --- include/paimon/defs.h | 3 + .../paimon/global_index/global_index_reader.h | 6 - .../paimon/global_index/global_index_result.h | 2 +- .../paimon/global_index/global_index_scan.h | 19 +- src/paimon/common/defs.cpp | 1 + .../offset_global_index_reader_test.cpp | 46 +- .../union_global_index_reader.cpp | 2 +- 
.../union_global_index_reader_test.cpp | 80 +++- src/paimon/core/core_options.cpp | 9 + src/paimon/core/core_options.h | 5 +- src/paimon/core/core_options_test.cpp | 3 + .../global_index/global_index_evaluator.h | 12 +- .../global_index_evaluator_impl.cpp | 32 +- .../global_index_evaluator_impl.h | 29 +- .../core/global_index/global_index_scan.cpp | 8 +- .../global_index/global_index_scan_impl.cpp | 38 +- .../global_index/global_index_scan_impl.h | 16 +- .../core/operation/file_store_scan_test.cpp | 54 +++ .../source/data_evolution_batch_scan.cpp | 5 +- test/inte/global_index_test.cpp | 403 ++++++++++-------- 20 files changed, 501 insertions(+), 272 deletions(-) diff --git a/include/paimon/defs.h b/include/paimon/defs.h index 73fd43d1e..7f223a786 100644 --- a/include/paimon/defs.h +++ b/include/paimon/defs.h @@ -365,6 +365,9 @@ struct PAIMON_EXPORT Options { static const char BLOB_AS_DESCRIPTOR[]; /// "global-index.enabled" - Whether to enable global index for scan. Default value is "true". static const char GLOBAL_INDEX_ENABLED[]; + /// "global-index.thread-num" - The maximum number of concurrent scanner for global index. No + /// default value. By default is the number of processors available to the machine. + static const char GLOBAL_INDEX_THREAD_NUM[]; /// "global-index.external-path" - Global index root directory, if not set, the global index /// files will be stored under the index directory. static const char GLOBAL_INDEX_EXTERNAL_PATH[]; diff --git a/include/paimon/global_index/global_index_reader.h b/include/paimon/global_index/global_index_reader.h index 6f76780e6..8338823c0 100644 --- a/include/paimon/global_index/global_index_reader.h +++ b/include/paimon/global_index/global_index_reader.h @@ -32,12 +32,6 @@ namespace paimon { /// Derived classes are expected to implement the visitor methods (e.g., `VisitEqual`, /// `VisitIsNull`, etc.) to return index-based results that indicate which /// rows satisfy the given predicate. 
-/// -/// @note Leaf implementations of `GlobalIndexReader` (e.g., -/// `BTreeGlobalIndexReader`, `BitmapGlobalIndexReader`) return `GlobalIndexResult` -/// objects containing those **local** row ids. Conversion to **global** row ids is -/// performed by wrapping the leaf reader with `OffsetGlobalIndexReader`, which adds -/// `GlobalIndexMeta.row_range_start` to every row id via `GlobalIndexResult::AddOffset()`. class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor> { public: /// VisitVectorSearch performs approximate vector similarity search. diff --git a/include/paimon/global_index/global_index_result.h b/include/paimon/global_index/global_index_result.h index bff85ac6c..13dbc0a13 100644 --- a/include/paimon/global_index/global_index_result.h +++ b/include/paimon/global_index/global_index_result.h @@ -27,7 +27,7 @@ #include "paimon/visibility.h" namespace paimon { -/// Global index result that holds the row ids selected by a predicate. +/// Global index result that holds the row ids. class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this { public: virtual ~GlobalIndexResult() = default; diff --git a/include/paimon/global_index/global_index_scan.h b/include/paimon/global_index/global_index_scan.h index 205a5ca6a..ff2c976fc 100644 --- a/include/paimon/global_index/global_index_scan.h +++ b/include/paimon/global_index/global_index_scan.h @@ -46,6 +46,8 @@ class PAIMON_EXPORT GlobalIndexScan { /// @param file_system File system for accessing index files. /// If not provided (nullptr), it is inferred from the `FILE_SYSTEM` /// key in the `options` parameter. + /// @param executor The executor to be used for asynchronous operations during global + /// index scan. /// @param pool Memory pool for temporary allocations; if nullptr, uses default. 
/// @return A `Result` containing a unique pointer to the created scanner, /// or an error if initialization fails (e.g., I/O error, invalid snapshot id, @@ -54,7 +56,8 @@ class PAIMON_EXPORT GlobalIndexScan { const std::string& table_path, const std::optional& snapshot_id, const std::optional>>& partitions, const std::map& options, - const std::shared_ptr& file_system, const std::shared_ptr& pool); + const std::shared_ptr& file_system, const std::shared_ptr& executor, + const std::shared_ptr& pool); /// Creates a `GlobalIndexScan` instance for the specified table and context, with a /// predicate-based partition filter. @@ -66,15 +69,17 @@ class PAIMON_EXPORT GlobalIndexScan { /// @param options User defined configuration. /// @param file_system File system for accessing index files. If nullptr, it is /// inferred from the `FILE_SYSTEM` key in `options`. - /// @param memory_pool Memory pool for temporary allocations; if nullptr, uses default. + /// @param executor The executor to be used for asynchronous operations during global + /// index scan. + /// @param pool Memory pool for temporary allocations; if nullptr, uses default. /// @return A `Result` containing a unique pointer to the created scanner, /// or an error if initialization fails. static Result> Create( const std::string& root_path, const std::optional& snapshot_id, const std::shared_ptr& partition_filters, const std::map& options, - const std::shared_ptr& file_system, - const std::shared_ptr& memory_pool); + const std::shared_ptr& file_system, const std::shared_ptr& executor, + const std::shared_ptr& pool); virtual ~GlobalIndexScan() = default; @@ -83,7 +88,8 @@ class PAIMON_EXPORT GlobalIndexScan { /// @param row_range_index Optional row range that limits the scan to a sub-range of row ids. /// If not provided, the entire row range is considered. 
/// @return A `Result` that is: - /// - Successful with several readers if the indexes exist and load correctly; + /// - Successful with several readers(with global row id) if the indexes exist and load + /// correctly; /// - Successful with an empty vector if no index was built for the given field; /// - Error returns when loading fails (e.g., file corruption, I/O error, /// unsupported format). @@ -96,7 +102,8 @@ class PAIMON_EXPORT GlobalIndexScan { /// @param row_range_index Optional row range that limits the scan to a sub-range of row ids. /// If not provided, the entire row range is considered. /// @return A `Result` that is: - /// - Successful with several readers if the indexes exist and load correctly; + /// - Successful with several readers(with global row id) if the indexes exist and load + /// correctly; /// - Successful with an empty vector if no index was built for the given field; /// - Error returns when loading fails (e.g., file corruption, I/O error, /// unsupported format). 
diff --git a/src/paimon/common/defs.cpp b/src/paimon/common/defs.cpp index 11fba2a17..fe7fceb3e 100644 --- a/src/paimon/common/defs.cpp +++ b/src/paimon/common/defs.cpp @@ -91,6 +91,7 @@ const char Options::DATA_EVOLUTION_ENABLED[] = "data-evolution.enabled"; const char Options::PARTITION_GENERATE_LEGACY_NAME[] = "partition.legacy-name"; const char Options::BLOB_AS_DESCRIPTOR[] = "blob-as-descriptor"; const char Options::GLOBAL_INDEX_ENABLED[] = "global-index.enabled"; +const char Options::GLOBAL_INDEX_THREAD_NUM[] = "global-index.thread-num"; const char Options::GLOBAL_INDEX_EXTERNAL_PATH[] = "global-index.external-path"; const char Options::AGGREGATION_REMOVE_RECORD_ON_DELETE[] = "aggregation.remove-record-on-delete"; const char Options::SCAN_TIMESTAMP_MILLIS[] = "scan.timestamp-millis"; diff --git a/src/paimon/common/global_index/offset_global_index_reader_test.cpp b/src/paimon/common/global_index/offset_global_index_reader_test.cpp index 8cee508f1..6eb3499f1 100644 --- a/src/paimon/common/global_index/offset_global_index_reader_test.cpp +++ b/src/paimon/common/global_index/offset_global_index_reader_test.cpp @@ -21,6 +21,7 @@ #include "gtest/gtest.h" #include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" #include "paimon/predicate/literal.h" #include "paimon/testing/utils/testharness.h" #include "paimon/utils/roaring_bitmap64.h" @@ -32,6 +33,13 @@ class FakeGlobalIndexReader : public GlobalIndexReader { default_result_ = row_ids; } + void SetVectorSearchResult(const std::vector& row_ids, + const std::vector& scores) { + vector_search_row_ids_ = row_ids; + vector_search_scores_ = scores; + has_vector_search_result_ = true; + } + Result> VisitIsNotNull() override { return MakeResult(default_result_); } @@ -93,7 +101,13 @@ class FakeGlobalIndexReader : public GlobalIndexReader { Result> VisitVectorSearch( const std::shared_ptr& vector_search) override { - return 
Status::Invalid("FakeGlobalIndexReader does not support vector search"); + if (!has_vector_search_result_) { + return Status::Invalid("FakeGlobalIndexReader does not support vector search"); + } + auto bitmap = RoaringBitmap64::From(vector_search_row_ids_); + auto scores = vector_search_scores_; + return std::make_shared(std::move(bitmap), + std::move(scores)); } Result> VisitFullTextSearch( @@ -119,6 +133,9 @@ class FakeGlobalIndexReader : public GlobalIndexReader { private: std::vector default_result_; + std::vector vector_search_row_ids_; + std::vector vector_search_scores_; + bool has_vector_search_result_ = false; }; class OffsetGlobalIndexReaderTest : public ::testing::Test { @@ -134,6 +151,20 @@ class OffsetGlobalIndexReaderTest : public ::testing::Test { << "result=" << bitmap->ToString() << ", expected=" << RoaringBitmap64::From(expected).ToString(); } + + static void CheckScoredResult(const std::shared_ptr& result, + const std::vector& expected_row_ids, + const std::vector& expected_scores) { + ASSERT_TRUE(result); + auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected_row_ids)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected_row_ids).ToString(); + ASSERT_EQ(typed_result->GetScores(), expected_scores); + } }; TEST_F(OffsetGlobalIndexReaderTest, TestVisitEqualWithOffset) { @@ -300,10 +331,21 @@ TEST_F(OffsetGlobalIndexReaderTest, TestVisitFullTextSearchWithOffset) { CheckResult(result, {10, 13, 15}); } +TEST_F(OffsetGlobalIndexReaderTest, TestVisitVectorSearchWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetVectorSearchResult({0, 2, 5}, {0.9f, 0.7f, 0.3f}); + + auto offset_reader = std::make_shared(fake_reader, 100); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitVectorSearch(nullptr)); + // row 
ids {0, 2, 5} + offset 100 -> {100, 102, 105}, scores unchanged + CheckScoredResult(result, {100, 102, 105}, {0.9f, 0.7f, 0.3f}); +} + TEST_F(OffsetGlobalIndexReaderTest, TestVisitVectorSearchNotSupported) { auto fake_reader = std::make_shared(); auto offset_reader = std::make_shared(fake_reader, 10); - // FakeGlobalIndexReader returns error for VectorSearch + // FakeGlobalIndexReader without SetVectorSearchResult returns error for VectorSearch ASSERT_NOK_WITH_MSG(offset_reader->VisitVectorSearch(nullptr), "FakeGlobalIndexReader does not support vector search"); } diff --git a/src/paimon/common/global_index/union_global_index_reader.cpp b/src/paimon/common/global_index/union_global_index_reader.cpp index 6108dcec9..d5133e945 100644 --- a/src/paimon/common/global_index/union_global_index_reader.cpp +++ b/src/paimon/common/global_index/union_global_index_reader.cpp @@ -185,7 +185,7 @@ Result> UnionGlobalIndexReader::Union(ReaderA template std::vector UnionGlobalIndexReader::ExecuteAllReaders( const std::function&)>& action) { - if (executor_ == nullptr) { + if (executor_ == nullptr || readers_.size() == 1) { std::vector results; results.reserve(readers_.size()); for (const auto& reader : readers_) { diff --git a/src/paimon/common/global_index/union_global_index_reader_test.cpp b/src/paimon/common/global_index/union_global_index_reader_test.cpp index f6f77d464..98ecfdc76 100644 --- a/src/paimon/common/global_index/union_global_index_reader_test.cpp +++ b/src/paimon/common/global_index/union_global_index_reader_test.cpp @@ -25,6 +25,7 @@ #include "gtest/gtest.h" #include "paimon/executor.h" #include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" #include "paimon/predicate/literal.h" #include "paimon/testing/utils/testharness.h" #include "paimon/utils/roaring_bitmap64.h" @@ -53,6 +54,13 @@ class FakeReader : public GlobalIndexReader { error_message_ = message; } + /// Sets a scored result returned 
by VisitVectorSearch. + void SetScoredResult(const std::vector& row_ids, const std::vector& scores) { + scored_row_ids_ = row_ids; + scored_scores_ = scores; + has_scored_result_ = true; + } + /// Counts how many times any Visit* method was invoked. Useful to assert all readers /// are exercised by UnionGlobalIndexReader. int InvocationCount() const { @@ -124,7 +132,13 @@ class FakeReader : public GlobalIndexReader { if (return_error_) { return Status::Invalid(error_message_); } - return std::shared_ptr(nullptr); + if (!has_scored_result_) { + return std::shared_ptr(nullptr); + } + auto bitmap = RoaringBitmap64::From(scored_row_ids_); + auto scores = scored_scores_; + return std::make_shared(std::move(bitmap), + std::move(scores)); } Result> VisitFullTextSearch( @@ -159,7 +173,10 @@ class FakeReader : public GlobalIndexReader { bool return_nullptr_ = false; bool return_error_ = false; std::string error_message_; - std::atomic invocation_count_{0}; + std::vector scored_row_ids_; + std::vector scored_scores_; + bool has_scored_result_ = false; + std::atomic invocation_count_{0}; }; class UnionGlobalIndexReaderTest : public ::testing::Test { @@ -175,6 +192,20 @@ class UnionGlobalIndexReaderTest : public ::testing::Test { << "result=" << bitmap->ToString() << ", expected=" << RoaringBitmap64::From(expected).ToString(); } + + static void CheckScoredResult(const std::shared_ptr& result, + const std::vector& expected_row_ids, + const std::vector& expected_scores) { + ASSERT_TRUE(result); + auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected_row_ids)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected_row_ids).ToString(); + ASSERT_EQ(typed_result->GetScores(), expected_scores); + } }; TEST_F(UnionGlobalIndexReaderTest, TestSingleReaderUnion) { @@ 
-419,6 +450,7 @@ TEST_F(UnionGlobalIndexReaderTest, TestVisitFullTextSearchUnion) { TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchAllNullptr) { auto reader1 = std::make_shared(); auto reader2 = std::make_shared(); + // Neither reader has SetScoredResult -> VisitVectorSearch returns nullptr reader1->SetDefaultResult({1}); reader2->SetDefaultResult({2}); @@ -429,10 +461,52 @@ TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchAllNullptr) { ASSERT_FALSE(result); } +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchSingleReader) { + auto reader = std::make_shared(); + reader->SetScoredResult({1, 3, 5}, {0.9f, 0.7f, 0.5f}); + + std::vector> readers = {reader}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitVectorSearch(nullptr)); + CheckScoredResult(result, {1, 3, 5}, {0.9f, 0.7f, 0.5f}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchMultipleReadersUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetScoredResult({1, 3}, {0.9f, 0.7f}); + reader2->SetScoredResult({2, 4}, {0.8f, 0.6f}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitVectorSearch(nullptr)); + // {1,3} OR {2,4} -> {1,2,3,4}, scores merged in row id order + CheckScoredResult(result, {1, 2, 3, 4}, {0.9f, 0.8f, 0.7f, 0.6f}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchPartialNullptr) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + auto reader3 = std::make_shared(); + reader1->SetScoredResult({1, 2}, {0.9f, 0.8f}); + // reader2 has no scored result -> returns nullptr + reader2->SetDefaultResult({10}); + reader3->SetScoredResult({5, 6}, {0.5f, 0.4f}); + + std::vector> readers = {reader1, reader2, reader3}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + 
ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitVectorSearch(nullptr)); + // reader2 nullptr is skipped, {1,2} OR {5,6} -> {1,2,5,6} + CheckScoredResult(result, {1, 2, 5, 6}, {0.9f, 0.8f, 0.5f, 0.4f}); +} + TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchErrorPropagation) { auto reader1 = std::make_shared(); auto reader2 = std::make_shared(); - reader1->SetDefaultResult({1}); + reader1->SetScoredResult({1}, {0.9f}); reader2->SetReturnError("vector search failure"); std::vector> readers = {reader1, reader2}; diff --git a/src/paimon/core/core_options.cpp b/src/paimon/core/core_options.cpp index b00f9a99f..c90a8dd9e 100644 --- a/src/paimon/core/core_options.cpp +++ b/src/paimon/core/core_options.cpp @@ -431,6 +431,7 @@ struct CoreOptions::Impl { bool data_evolution_enabled = false; bool legacy_partition_name_enabled = true; bool global_index_enabled = true; + std::optional global_index_thread_num; bool commit_force_compact = false; bool compaction_force_rewrite_all_files = false; bool compaction_force_up_level_0 = false; @@ -688,6 +689,10 @@ struct CoreOptions::Impl { // Parse global-index.enabled - whether to enable global index for scan, default true PAIMON_RETURN_NOT_OK( parser.Parse(Options::GLOBAL_INDEX_ENABLED, &global_index_enabled)); + // Parse global-index.thread-num - the maximum number of concurrent scanner for global + // index, no default value + PAIMON_RETURN_NOT_OK( + parser.Parse(Options::GLOBAL_INDEX_THREAD_NUM, &global_index_thread_num)); // Parse global-index.external-path - global index root directory PAIMON_RETURN_NOT_OK( parser.Parse(Options::GLOBAL_INDEX_EXTERNAL_PATH, &global_index_external_path)); @@ -1270,6 +1275,10 @@ bool CoreOptions::GlobalIndexEnabled() const { return impl_->global_index_enabled; } +std::optional CoreOptions::GetGlobalIndexThreadNum() const { + return impl_->global_index_thread_num; +} + std::optional CoreOptions::GetGlobalIndexExternalPath() const { return impl_->global_index_external_path; } diff --git 
a/src/paimon/core/core_options.h b/src/paimon/core/core_options.h index e50176a4b..1f15e3fd5 100644 --- a/src/paimon/core/core_options.h +++ b/src/paimon/core/core_options.h @@ -174,9 +174,10 @@ class PAIMON_EXPORT CoreOptions { int64_t GetLookupCacheFileRetentionMs() const; int64_t GetLookupCacheMaxDiskSize() const; - const std::map& ToMap() const; - BucketFunctionType GetBucketFunctionType() const; + std::optional GetGlobalIndexThreadNum() const; + + const std::map& ToMap() const; private: std::optional GetDataFileExternalPaths() const; diff --git a/src/paimon/core/core_options_test.cpp b/src/paimon/core/core_options_test.cpp index c0f63f721..5802e6ee6 100644 --- a/src/paimon/core/core_options_test.cpp +++ b/src/paimon/core/core_options_test.cpp @@ -115,6 +115,7 @@ TEST(CoreOptionsTest, TestDefaultValue) { ASSERT_TRUE(core_options.LegacyPartitionNameEnabled()); ASSERT_TRUE(core_options.GlobalIndexEnabled()); ASSERT_EQ(std::nullopt, core_options.GetGlobalIndexExternalPath()); + ASSERT_EQ(std::nullopt, core_options.GetGlobalIndexThreadNum()); ASSERT_EQ(std::nullopt, core_options.GetScanTagName()); ASSERT_EQ(std::nullopt, core_options.GetOptimizedCompactionInterval()); ASSERT_EQ(std::nullopt, core_options.GetCompactionTotalSizeThreshold()); @@ -209,6 +210,7 @@ TEST(CoreOptionsTest, TestFromMap) { {Options::DATA_EVOLUTION_ENABLED, "true"}, {Options::PARTITION_GENERATE_LEGACY_NAME, "false"}, {Options::GLOBAL_INDEX_ENABLED, "false"}, + {Options::GLOBAL_INDEX_THREAD_NUM, "4"}, {Options::GLOBAL_INDEX_EXTERNAL_PATH, "FILE:///tmp/global_index/"}, {Options::SCAN_TAG_NAME, "test-tag"}, {Options::WRITE_ONLY, "true"}, @@ -333,6 +335,7 @@ TEST(CoreOptionsTest, TestFromMap) { ASSERT_TRUE(core_options.DataEvolutionEnabled()); ASSERT_FALSE(core_options.LegacyPartitionNameEnabled()); ASSERT_FALSE(core_options.GlobalIndexEnabled()); + ASSERT_EQ(core_options.GetGlobalIndexThreadNum(), 4); ASSERT_TRUE(core_options.GetGlobalIndexExternalPath()); 
ASSERT_EQ(core_options.GetGlobalIndexExternalPath().value(), "FILE:///tmp/global_index/"); ASSERT_EQ("test-tag", core_options.GetScanTagName().value()); diff --git a/src/paimon/core/global_index/global_index_evaluator.h b/src/paimon/core/global_index/global_index_evaluator.h index 0a624b1d1..856198782 100644 --- a/src/paimon/core/global_index/global_index_evaluator.h +++ b/src/paimon/core/global_index/global_index_evaluator.h @@ -21,8 +21,6 @@ #include "paimon/global_index/global_index_result.h" #include "paimon/predicate/predicate.h" -#include "paimon/predicate/vector_search.h" -#include "paimon/utils/row_range_index.h" #include "paimon/visibility.h" namespace paimon { @@ -33,19 +31,13 @@ class PAIMON_EXPORT GlobalIndexEvaluator { /// Evaluates a predicate against the global index. /// /// @param predicate The filter predicate to evaluate. - /// @param row_range_index Optional row range that limits evaluation to the given - /// ranges of row ids. Index files whose row range does not - /// intersect with `row_range_index` will be skipped. If a field has - /// no usable index file in the requested range, the evaluator - /// returns `nullptr` for that field. /// @return A `Result` containing: /// - `nullptr` if the predicate cannot be evaluated by this index (e.g., field has - /// no index, or no index file intersects with `row_range_index`), + /// no index), /// - A `std::shared_ptr` if evaluation succeeds. /// The `GlobalIndexResult` indicates the matching rows (e.g., via row ID bitmaps). 
virtual Result> Evaluate( - const std::shared_ptr& predicate, - const std::optional& row_range_index) = 0; + const std::shared_ptr& predicate) = 0; }; } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_evaluator_impl.cpp b/src/paimon/core/global_index/global_index_evaluator_impl.cpp index a0ffcf8b8..bb123acd0 100644 --- a/src/paimon/core/global_index/global_index_evaluator_impl.cpp +++ b/src/paimon/core/global_index/global_index_evaluator_impl.cpp @@ -23,47 +23,42 @@ namespace paimon { Result> GlobalIndexEvaluatorImpl::Evaluate( - const std::shared_ptr& predicate, - const std::optional& row_range_index) { + const std::shared_ptr& predicate) { std::shared_ptr compound_result; if (predicate) { - ReadersCache cache; - PAIMON_ASSIGN_OR_RAISE(compound_result, - EvaluatePredicate(predicate, row_range_index, cache)); + PAIMON_ASSIGN_OR_RAISE(compound_result, EvaluatePredicate(predicate)); } return compound_result; } Result>> GlobalIndexEvaluatorImpl::GetIndexReaders( - const std::string& field_name, const std::optional& row_range_index, - ReadersCache& cache) { + const std::string& field_name) { PAIMON_ASSIGN_OR_RAISE(DataField data_field, table_schema_->GetField(field_name)); int32_t field_id = data_field.Id(); // get or create global index readers for current field std::vector> readers; - auto iter = cache.find(field_id); - if (iter != cache.end()) { + auto iter = index_readers_cache_.find(field_id); + if (iter != index_readers_cache_.end()) { readers = iter->second; } else { - PAIMON_ASSIGN_OR_RAISE(readers, create_index_readers_(field_id, row_range_index)); - cache.insert({field_id, readers}); + PAIMON_ASSIGN_OR_RAISE(readers, create_index_readers_(field_id)); + index_readers_cache_.insert({field_id, readers}); } return readers; } Result> GlobalIndexEvaluatorImpl::EvaluatePredicate( - const std::shared_ptr& predicate, - const std::optional& row_range_index, ReadersCache& cache) { + const std::shared_ptr& predicate) { if (predicate == nullptr) 
{ return std::shared_ptr(nullptr); } if (auto compound_predicate = std::dynamic_pointer_cast(predicate)) { - return EvaluateCompoundPredicate(compound_predicate, row_range_index, cache); + return EvaluateCompoundPredicate(compound_predicate); } else if (auto leaf_predicate = std::dynamic_pointer_cast(predicate)) { const std::string& field_name = leaf_predicate->FieldName(); PAIMON_ASSIGN_OR_RAISE(std::vector> readers, - GetIndexReaders(field_name, row_range_index, cache)); + GetIndexReaders(field_name)); if (readers.empty()) { // No usable index for this field within the requested range. Treat as "no // pushdown available" so the upstream falls back to a full scan instead of @@ -100,13 +95,12 @@ Result> GlobalIndexEvaluatorImpl::EvaluatePre } Result> GlobalIndexEvaluatorImpl::EvaluateCompoundPredicate( - const std::shared_ptr& compound_predicate, - const std::optional& row_range_index, ReadersCache& cache) { + const std::shared_ptr& compound_predicate) { if (compound_predicate->GetFunction().GetType() == Function::Type::OR) { std::shared_ptr compound_result; for (const auto& child : compound_predicate->Children()) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr sub_result, - EvaluatePredicate(child, row_range_index, cache)); + EvaluatePredicate(child)); if (!sub_result) { return std::shared_ptr(nullptr); } @@ -123,7 +117,7 @@ Result> GlobalIndexEvaluatorImpl::EvaluateCom std::shared_ptr compound_result; for (const auto& child : compound_predicate->Children()) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr sub_result, - EvaluatePredicate(child, row_range_index, cache)); + EvaluatePredicate(child)); if (sub_result) { if (!compound_result) { compound_result = sub_result; diff --git a/src/paimon/core/global_index/global_index_evaluator_impl.h b/src/paimon/core/global_index/global_index_evaluator_impl.h index 21b14cd8e..7cae89193 100644 --- a/src/paimon/core/global_index/global_index_evaluator_impl.h +++ b/src/paimon/core/global_index/global_index_evaluator_impl.h @@ -27,47 +27,38 
@@ #include "paimon/core/schema/table_schema.h" #include "paimon/global_index/global_index_reader.h" #include "paimon/predicate/compound_predicate.h" -#include "paimon/utils/row_range_index.h" namespace paimon { class GlobalIndexEvaluatorImpl : public GlobalIndexEvaluator { public: - /// Creates the underlying readers for the given field, optionally restricted to the - /// provided row range. Returns an empty vector when the field has no usable index. + /// Creates the underlying readers for the given field. Returns an empty vector when the field + /// has no usable index. using IndexReadersCreator = - std::function>>( - int32_t, const std::optional&)>; + std::function>>(int32_t)>; GlobalIndexEvaluatorImpl(const std::shared_ptr& table_schema, IndexReadersCreator create_index_readers) : table_schema_(table_schema), create_index_readers_(std::move(create_index_readers)) {} Result> Evaluate( - const std::shared_ptr& predicate, - const std::optional& row_range_index) override; + const std::shared_ptr& predicate) override; private: - /// Per-evaluation cache keyed by field id. Reused across recursive calls within a single - /// `Evaluate` invocation so that the same field is not loaded twice; a fresh cache is used - /// for every public `Evaluate` call because `row_range_index` may change between calls. 
- using ReadersCache = std::map>>; - Result> EvaluatePredicate( - const std::shared_ptr& predicate, - const std::optional& row_range_index, ReadersCache& cache); + const std::shared_ptr& predicate); Result> EvaluateCompoundPredicate( - const std::shared_ptr& compound_predicate, - const std::optional& row_range_index, ReadersCache& cache); + const std::shared_ptr& compound_predicate); Result>> GetIndexReaders( - const std::string& field_name, const std::optional& row_range_index, - ReadersCache& cache); + const std::string& field_name); private: std::shared_ptr table_schema_; - // create_index_readers_(field_id, row_range_index) + // create_index_readers_(field_id) IndexReadersCreator create_index_readers_; + // [field_id, vector] + std::map>> index_readers_cache_; }; } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_scan.cpp b/src/paimon/core/global_index/global_index_scan.cpp index 309a4b605..aa54a3a6e 100644 --- a/src/paimon/core/global_index/global_index_scan.cpp +++ b/src/paimon/core/global_index/global_index_scan.cpp @@ -69,7 +69,7 @@ Result> GlobalIndexScan::Create( const std::string& root_path, const std::optional& snapshot_id, const std::optional>>& partitions, const std::map& options, - const std::shared_ptr& file_system, + const std::shared_ptr& file_system, const std::shared_ptr& executor, const std::shared_ptr& memory_pool) { if (partitions && partitions.value().empty()) { return Status::Invalid( @@ -90,13 +90,13 @@ Result> GlobalIndexScan::Create( } PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, LoadSnapshot(root_path, snapshot_id, core_options)); return GlobalIndexScanImpl::Create(root_path, table_schema, snapshot, partition_filters, - core_options, pool); + core_options, executor, pool); } Result> GlobalIndexScan::Create( const std::string& root_path, const std::optional& snapshot_id, const std::shared_ptr& partitions, const std::map& options, - const std::shared_ptr& file_system, + const std::shared_ptr& file_system, const 
std::shared_ptr& executor, const std::shared_ptr& memory_pool) { std::shared_ptr partition_filters; if (partitions) { @@ -112,7 +112,7 @@ Result> GlobalIndexScan::Create( MergeOptions(table_schema, options, file_system)); PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, LoadSnapshot(root_path, snapshot_id, core_options)); return GlobalIndexScanImpl::Create(root_path, table_schema, snapshot, partition_filters, - core_options, pool); + core_options, executor, pool); } } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_scan_impl.cpp b/src/paimon/core/global_index/global_index_scan_impl.cpp index 1277fc8cb..de7ce1b83 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.cpp +++ b/src/paimon/core/global_index/global_index_scan_impl.cpp @@ -16,10 +16,13 @@ #include "paimon/core/global_index/global_index_scan_impl.h" #include +#include #include +#include "arrow/c/bridge.h" #include "paimon/common/global_index/offset_global_index_reader.h" #include "paimon/common/global_index/union_global_index_reader.h" +#include "paimon/common/utils/scope_guard.h" #include "paimon/core/global_index/global_index_evaluator_impl.h" #include "paimon/core/index/index_file_handler.h" #include "paimon/global_index/bitmap_global_index_result.h" @@ -31,18 +34,21 @@ GlobalIndexScanImpl::GlobalIndexScanImpl(const std::shared_ptr& tab const CoreOptions& options, const std::shared_ptr& path_factory, IndexMetaMap&& index_metas, + const std::shared_ptr& executor, const std::shared_ptr& pool) : pool_(pool), table_schema_(table_schema), options_(options), index_file_manager_( std::make_shared(options.GetFileSystem(), path_factory)), - index_metas_(std::move(index_metas)) {} + index_metas_(std::move(index_metas)), + executor_(executor) {} Result> GlobalIndexScanImpl::Create( const std::string& root_path, const std::shared_ptr& table_schema, const Snapshot& snapshot, const std::shared_ptr& partitions, - const CoreOptions& options, const std::shared_ptr& pool) { + const 
CoreOptions& options, const std::shared_ptr& executor, + const std::shared_ptr& pool) { auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options.CreateExternalPaths()); PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, @@ -94,8 +100,17 @@ Result> GlobalIndexScanImpl::Create( index_metas[index_meta->index_field_id][index_file_meta->IndexType()][range].push_back( index_file_meta); } - return std::unique_ptr( - new GlobalIndexScanImpl(table_schema, options, path_factory, std::move(index_metas), pool)); + auto final_executor = executor; + if (!final_executor) { + std::optional thread_num = options.GetGlobalIndexThreadNum(); + if (!thread_num) { + uint32_t cpu_count = std::thread::hardware_concurrency(); + thread_num = cpu_count > 0 ? static_cast(cpu_count) : 1; + } + final_executor = CreateDefaultExecutor(static_cast(thread_num.value())); + } + return std::unique_ptr(new GlobalIndexScanImpl( + table_schema, options, path_factory, std::move(index_metas), final_executor, pool)); } Result> GlobalIndexScanImpl::GetOrCreateIndexEvaluator() { @@ -103,9 +118,8 @@ Result> GlobalIndexScanImpl::GetOrCreateIn return evaluator_; } GlobalIndexEvaluatorImpl::IndexReadersCreator create_index_readers = - [this](int32_t field_id, const std::optional& row_range_index) - -> Result>> { - return CreateReaders(field_id, row_range_index); + [this](int32_t field_id) -> Result>> { + return CreateReaders(field_id, /*row_range_index=*/std::nullopt); }; evaluator_ = std::make_shared(table_schema_, create_index_readers); return evaluator_; @@ -161,9 +175,8 @@ Result>> GlobalIndexScanImpl::Cre if (union_readers.empty()) { continue; } - // TODO(lisizhuo.lsz): add executor in UnionGlobalIndexReader - readers.push_back(std::make_shared(std::move(union_readers), - /*executor=*/nullptr)); + readers.push_back( + std::make_shared(std::move(union_readers), executor_)); } return readers; } @@ -187,11 
+200,10 @@ GlobalIndexIOMeta GlobalIndexScanImpl::ToGlobalIndexIOMeta( } Result> GlobalIndexScanImpl::Scan( - const std::shared_ptr& predicate, - const std::optional& row_range_index) { + const std::shared_ptr& predicate) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr evaluator, GetOrCreateIndexEvaluator()); - return evaluator->Evaluate(predicate, row_range_index); + return evaluator->Evaluate(predicate); } } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_scan_impl.h b/src/paimon/core/global_index/global_index_scan_impl.h index c4b0d5f0c..8988d32f4 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.h +++ b/src/paimon/core/global_index/global_index_scan_impl.h @@ -38,17 +38,17 @@ class GlobalIndexScanImpl : public GlobalIndexScan { static Result> Create( const std::string& root_path, const std::shared_ptr& table_schema, const Snapshot& snapshot, const std::shared_ptr& partitions, - const CoreOptions& options, const std::shared_ptr& pool); + const CoreOptions& options, const std::shared_ptr& executor, + const std::shared_ptr& pool); - Result> Scan( - const std::shared_ptr& predicate, - const std::optional& row_range_index); + Result> Scan(const std::shared_ptr& predicate); Result>> CreateReaders( - int32_t field_id, const std::optional& row_range_index) const; + const std::string& field_name, + const std::optional& row_range_index) const override; Result>> CreateReaders( - const std::string& field_name, const std::optional& row_range_index) const; + int32_t field_id, const std::optional& row_range_index) const override; private: /// (id->index_type->row_range) -> index meta list @@ -59,7 +59,8 @@ class GlobalIndexScanImpl : public GlobalIndexScan { GlobalIndexScanImpl(const std::shared_ptr& table_schema, const CoreOptions& options, const std::shared_ptr& path_factory, - IndexMetaMap&& index_metas, const std::shared_ptr& pool); + IndexMetaMap&& index_metas, const std::shared_ptr& executor, + const std::shared_ptr& pool); Result> 
GetOrCreateIndexEvaluator(); @@ -78,6 +79,7 @@ class GlobalIndexScanImpl : public GlobalIndexScan { CoreOptions options_; std::shared_ptr index_file_manager_; IndexMetaMap index_metas_; + std::shared_ptr executor_; std::shared_ptr evaluator_; }; diff --git a/src/paimon/core/operation/file_store_scan_test.cpp b/src/paimon/core/operation/file_store_scan_test.cpp index 888ee57ff..d2ef0db13 100644 --- a/src/paimon/core/operation/file_store_scan_test.cpp +++ b/src/paimon/core/operation/file_store_scan_test.cpp @@ -129,4 +129,58 @@ TEST_F(FileStoreScanTest, TestCreatePartitionPredicateWithInvalidPartitionFilter "field invalid does not exist in partition keys"); } +TEST_F(FileStoreScanTest, TestFilterManifestByRowRanges) { + class FakeFileStoreScan : public FileStoreScan { + public: + FakeFileStoreScan(const std::shared_ptr& snapshot_manager, + const std::shared_ptr& schema_manager, + const std::shared_ptr& manifest_list, + const std::shared_ptr& manifest_file, + const std::shared_ptr& table_schema, + const std::shared_ptr& schema, + const CoreOptions& core_options, + const std::shared_ptr& executor, + const std::shared_ptr& pool) + : FileStoreScan(snapshot_manager, schema_manager, manifest_list, manifest_file, + table_schema, schema, core_options, executor, pool) {} + Result FilterByStats(const ManifestEntry& entry) const override { + return false; + } + }; + // row id [10, 20] + auto manifest1 = + ManifestFileMeta("manifest-65b0d403-a1bc-4157-b242-bff73c46596d-0", /*file_size=*/2779, + /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), + /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, + /*min_level=*/0, /*max_level=*/0, + /*min_row_id=*/10, /*max_row_id=*/20); + + ASSERT_OK_AND_ASSIGN(CoreOptions options, CoreOptions::FromMap({{}})); + auto file_store_scan = std::make_shared( + /*snapshot_manager=*/nullptr, /*schema_manager=*/nullptr, /*manifest_list=*/nullptr, + /*manifest_file=*/nullptr, /*table_schema=*/nullptr, /*schema=*/nullptr, 
options, + /*executor=*/CreateDefaultExecutor(), GetDefaultPool()); + ASSERT_TRUE(file_store_scan->FilterManifestByRowRanges(manifest1)); + + ASSERT_OK_AND_ASSIGN( + RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(0, 15), Range(100, 200)}))); + file_store_scan->WithRowRangeIndex(row_range_index); + ASSERT_TRUE(file_store_scan->FilterManifestByRowRanges(manifest1)); + + ASSERT_OK_AND_ASSIGN(row_range_index, + RowRangeIndex::Create(std::vector({Range(0, 5), Range(100, 200)}))); + file_store_scan->WithRowRangeIndex(row_range_index); + ASSERT_FALSE(file_store_scan->FilterManifestByRowRanges(manifest1)); + + auto manifest2 = + ManifestFileMeta("manifest-65b0d403-a1bc-4157-b242-bff73c46596d-0", /*file_size=*/2779, + /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), + /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, + /*min_level=*/0, /*max_level=*/0, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + ASSERT_OK_AND_ASSIGN(row_range_index, RowRangeIndex::Create(std::vector({Range(0, 0)}))); + file_store_scan->WithRowRangeIndex(row_range_index); + ASSERT_TRUE(file_store_scan->FilterManifestByRowRanges(manifest2)); +} } // namespace paimon::test diff --git a/src/paimon/core/table/source/data_evolution_batch_scan.cpp b/src/paimon/core/table/source/data_evolution_batch_scan.cpp index 7c617f46a..94310b2e5 100644 --- a/src/paimon/core/table/source/data_evolution_batch_scan.cpp +++ b/src/paimon/core/table/source/data_evolution_batch_scan.cpp @@ -129,13 +129,14 @@ Result> DataEvolutionBatchScan::EvalGlobalInd PAIMON_ASSIGN_OR_RAISE( std::unique_ptr index_scan, GlobalIndexScan::Create(table_path_, core_options_.GetScanSnapshotId(), partition_filter, - core_options_.ToMap(), core_options_.GetFileSystem(), pool_)); + core_options_.ToMap(), core_options_.GetFileSystem(), executor_, + pool_)); auto index_scan_impl = dynamic_cast(index_scan.get()); if (!index_scan_impl) { return Status::Invalid("invalid GlobalIndexScan, 
cannot cast to GlobalIndexScanImpl"); } - return index_scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt); + return index_scan_impl->Scan(predicate); } } // namespace paimon diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp index 6cdcf8e71..ba43f4bbd 100644 --- a/test/inte/global_index_test.cpp +++ b/test/inte/global_index_test.cpp @@ -13,17 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include "arrow/type.h" #include "gtest/gtest.h" #include "paimon/common/factories/io_hook.h" #include "paimon/common/global_index/bitmap/bitmap_global_index_factory.h" +#include "paimon/common/global_index/union_global_index_reader.h" #include "paimon/common/table/special_fields.h" #include "paimon/common/utils/scope_guard.h" #include "paimon/core/global_index/global_index_scan_impl.h" #include "paimon/core/global_index/indexed_split_impl.h" #include "paimon/core/table/source/data_split_impl.h" #include "paimon/defs.h" +#include "paimon/executor.h" #include "paimon/fs/file_system.h" #include "paimon/global_index/bitmap_global_index_result.h" #include "paimon/global_index/bitmap_scored_global_index_result.h" @@ -431,10 +432,10 @@ TEST_P(GlobalIndexTest, TestScanIndex) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // test index reader // test f0 field ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); @@ -446,9 +447,8 @@ TEST_P(GlobalIndexTest, 
TestScanIndex) { auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); { // test with non predicate - ASSERT_OK_AND_ASSIGN( - auto index_result, - global_index_scan_impl->Scan(/*predicate=*/nullptr, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, + global_index_scan_impl->Scan(/*predicate=*/nullptr)); ASSERT_FALSE(index_result); } { @@ -456,8 +456,7 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,7}"); } { @@ -465,40 +464,35 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{1,2,3,4,5,6}"); } { // test equal predicate for f1 auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{4,6,7}"); } { // test equal predicate for f2 auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(1)); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + 
ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,1,4,5}"); } { // test is null predicate auto predicate = PredicateBuilder::IsNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{7}"); } { // test is not null predicate auto predicate = PredicateBuilder::IsNotNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,1,2,3,4,5,6}"); } { @@ -507,8 +501,7 @@ TEST_P(GlobalIndexTest, TestScanIndex) { /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, {Literal(FieldType::STRING, "Alice", 5), Literal(FieldType::STRING, "Bob", 3), Literal(FieldType::STRING, "Lucy", 4)}); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,1,4,5,7}"); } { @@ -517,8 +510,7 @@ TEST_P(GlobalIndexTest, TestScanIndex) { /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, {Literal(FieldType::STRING, "Alice", 5), Literal(FieldType::STRING, "Bob", 3), Literal(FieldType::STRING, "Lucy", 4)}); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{2,3,6}"); } { @@ -529,8 +521,7 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto f1_predicate = 
PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{7}"); } { @@ -541,16 +532,14 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,4,6,7}"); } { // test non-result auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(30)); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{}"); } { @@ -565,49 +554,42 @@ TEST_P(GlobalIndexTest, TestScanIndex) { ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f1_predicate, f2_predicate, f0_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{}"); } { // test greater than predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto 
index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test greater or equal predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::GreaterOrEqual(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test less than predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::LessThan(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test less or equal predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::LessOrEqual(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test a predicate for field with no index auto f3_predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(1.2)); - ASSERT_OK_AND_ASSIGN( - auto index_result, - global_index_scan_impl->Scan(f3_predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(f3_predicate)); ASSERT_FALSE(index_result); } } @@ -620,10 +602,10 @@ TEST_P(GlobalIndexTest, 
TestScanIndexWithSpecificSnapshot) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; // snapshot 2 has f0 index - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/2l, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/2l, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // test index reader // test f0 field ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); @@ -633,7 +615,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { ASSERT_EQ(index_result->ToString(), "{0,7}"); // test f1 field ASSERT_OK_AND_ASSIGN(auto index_readers2, global_index_scan->CreateReaders("f1", std::nullopt)); - ASSERT_EQ(index_readers.size(), 1u); + ASSERT_EQ(index_readers2.size(), 0u); auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); @@ -645,8 +627,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,7}"); } { @@ -657,8 +638,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, 
global_index_scan_impl->Scan( - predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } } @@ -671,10 +651,10 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshotWithNoIndex) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; // snapshot 1 has no index - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/1l, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/1l, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // test index reader ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); ASSERT_EQ(index_readers.size(), 0u); @@ -684,8 +664,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshotWithNoIndex) { auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - global_index_scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } @@ -696,10 +675,10 @@ TEST_P(GlobalIndexTest, TestScanIndexWithRange) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + 
/*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); { // test index reader @@ -713,9 +692,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithRange) { auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN( - auto evaluator_result, - global_index_scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto evaluator_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(evaluator_result->ToString(), "{1,2,3,4,5,6}"); } { @@ -724,13 +701,6 @@ TEST_P(GlobalIndexTest, TestScanIndexWithRange) { ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", row_range_index)); ASSERT_EQ(index_readers.size(), 0u); - - auto predicate = - PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - global_index_scan_impl->Scan(predicate, row_range_index)); - ASSERT_FALSE(index_result); } } @@ -745,9 +715,10 @@ TEST_P(GlobalIndexTest, TestScanIndexWithPartition) { "/append_with_global_index_with_partition.db/append_with_global_index_with_partition"; auto check_result = [&](const std::optional>>& partitions) { - ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - partitions, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, + /*options=*/{}, fs_, /*executor=*/nullptr, pool_)); // test index reader ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(0, 4)})); @@ -766,26 +737,19 @@ TEST_P(GlobalIndexTest, TestScanIndexWithPartition) { auto predicate = 
PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(1)); - ASSERT_OK_AND_ASSIGN(auto index_result, - global_index_scan_impl->Scan(predicate, row_range_index)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test not equal predicate for Bob - auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", - FieldType::STRING, - Literal(FieldType::STRING, "Bob", 3)); - ASSERT_OK_AND_ASSIGN(auto index_result, - global_index_scan_impl->Scan(predicate, row_range_index)); + ASSERT_OK_AND_ASSIGN(auto index_result, index_readers[0]->VisitNotEqual( + Literal(FieldType::STRING, "Bob", 3))); ASSERT_EQ(index_result->ToString(), "{0,2,3}"); } { // test equal predicate for Alice - auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", - FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - global_index_scan_impl->Scan(predicate, row_range_index)); + ASSERT_OK_AND_ASSIGN(auto index_result, index_readers[0]->VisitEqual(Literal( + FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0}"); } }; @@ -807,10 +771,10 @@ TEST_P(GlobalIndexTest, TestScanUnregisteredIndex) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); ASSERT_EQ(index_readers.size(), 0u); @@ -819,8 +783,7 @@ 
TEST_P(GlobalIndexTest, TestScanUnregisteredIndex) { PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Bob", 3)); - ASSERT_OK_AND_ASSIGN(auto index_result, - global_index_scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } @@ -848,10 +811,10 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndex) { ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", /*options=*/{}, Range(0, 7))); - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(auto global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, @@ -924,22 +887,18 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { const std::shared_ptr& expected_array, const std::map& id_to_score) { std::vector> partitions = {partition}; - ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - partitions, lumina_options, fs_, pool_)); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, + lumina_options, fs_, /*executor=*/nullptr, pool_)); // check bitmap index - auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); - - auto predicate1 = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto predicate2 = - 
PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Paul", 4)); - ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({predicate1, predicate2})); - - ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate, row_range_index)); - ASSERT_TRUE(index_result); + ASSERT_OK_AND_ASSIGN(auto readers, global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(readers.size(), 1u); + ASSERT_OK_AND_ASSIGN(auto result1, + readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_OK_AND_ASSIGN(auto result2, + readers[0]->VisitEqual(Literal(FieldType::STRING, "Paul", 4))); + ASSERT_OK_AND_ASSIGN(auto index_result, result1->Or(result2)); ASSERT_EQ(index_result->ToString(), bitmap_result); // check lumina index @@ -1000,7 +959,7 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { GlobalIndexScan::Create( table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::vector>(), lumina_options, - fs_, pool_), + fs_, /*executor=*/nullptr, pool_), "invalid input partition, supposed to be null or at least one partition"); } } @@ -1624,10 +1583,11 @@ TEST_P(GlobalIndexTest, TestScanIndexWithTwoIndexes) { ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", /*options=*/lumina_options, Range(0, 8))); - ASSERT_OK_AND_ASSIGN(auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, - /*options=*/lumina_options, fs_, pool_)); + ASSERT_OK_AND_ASSIGN( + auto global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, + /*options=*/lumina_options, fs_, /*executor=*/nullptr, pool_)); // query f0 ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); ASSERT_EQ(index_readers.size(), 1); @@ -2078,13 +2038,13 @@ TEST_P(GlobalIndexTest, TestLuceneWriteCommitScanReadIndexWithScore) { 
ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "lucene-fts", /*options=*/lucene_options, Range(0, 3))); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // test f0 field ASSERT_OK_AND_ASSIGN(auto index_readers, - global_index_scan->CreateReaders("f0", /*row_rangw_index=*/std::nullopt)); + global_index_scan->CreateReaders("f0", /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_readers.size(), 1u); auto index_reader = index_readers[0]; { @@ -2145,10 +2105,10 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndex) { ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "btree", /*options=*/{}, Range(0, 7))); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", /*row_range_index=*/std::nullopt)); ASSERT_EQ(index_readers.size(), 1u); @@ -2248,8 +2208,7 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndex) { auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Tony", 4)); - ASSERT_OK_AND_ASSIGN(auto result, - scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto result, scan_impl->Scan(predicate)); ASSERT_TRUE(result); ASSERT_EQ(result->ToString(), 
"{5,6}"); } @@ -2263,8 +2222,7 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndex) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto result, - scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto result, scan_impl->Scan(predicate)); ASSERT_TRUE(result); ASSERT_EQ(result->ToString(), "{1,2}"); } @@ -2294,8 +2252,7 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndex) { PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Bob", 3)); auto scan_impl = std::dynamic_pointer_cast(global_index_scan); - ASSERT_OK_AND_ASSIGN(auto index_result, - scan_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, scan_impl->Scan(predicate)); ASSERT_TRUE(index_result); ASSERT_EQ(index_result->ToString(), "{1,2}"); @@ -2356,10 +2313,10 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndexWithPartition) { // Scan all partitions { - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, + fs_, /*executor=*/nullptr, pool_)); ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); // One reader per partition range -> 2 ranges -> UnionGlobalIndexReader wraps them @@ -2391,7 +2348,7 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndexWithPartition) { ASSERT_OK_AND_ASSIGN( std::shared_ptr global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, - /*options=*/{}, 
fs_, pool_)); + /*options=*/{}, fs_, /*executor=*/nullptr, pool_)); ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); ASSERT_EQ(index_readers.size(), 1u); @@ -2411,10 +2368,10 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndexWithPartition) { // Scan with row_range_index filtering: only range [5,7] (partition f1=20) { - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, + fs_, /*executor=*/nullptr, pool_)); ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(5, 7)})); ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", row_range_index)); @@ -2429,18 +2386,17 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndexWithPartition) { // Full pipeline with evaluator: Scan(predicate) -> read data { - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, + fs_, /*executor=*/nullptr, pool_)); auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); ASSERT_TRUE(scanner_impl); auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Tony", 4)); - ASSERT_OK_AND_ASSIGN(auto index_result, - scanner_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); ASSERT_TRUE(index_result); 
ASSERT_EQ(index_result->ToString(), "{4,7}"); @@ -2458,6 +2414,105 @@ TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndexWithPartition) { } } +TEST_P(GlobalIndexTest, TestBTreeWithPartitionAndCustomExecutor) { + // Test that UnionGlobalIndexReader uses a custom 8-thread executor to read + // btree indexes from two partitions in parallel. + auto schema = arrow::schema(fields_); + std::map options = {{Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, file_format_}, + {Options::FILE_SYSTEM, "local"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(/*partition_keys=*/{"f1"}, schema, options); + + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + std::vector write_cols = schema->field_names(); + + // Write partition f1=10 (5 rows, sorted by f0) + auto src_array1 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 10, 1, 11.1], +["Bob", 10, 1, 12.1], +["Bob", 10, 0, 13.1], +["Emily", 10, 0, 14.1], +["Tony", 10, 1, 15.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto commit_msgs1, + WriteArray(table_path, {{"f1", "10"}}, write_cols, src_array1)); + ASSERT_OK(Commit(table_path, commit_msgs1)); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f1", "10"}}}, "f0", "btree", + /*options=*/{}, Range(0, 4))); + + // Write partition f1=20 (3 rows, sorted by f0) + auto src_array2 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 20, null, 16.1], +["Lucy", 20, 1, 17.1], +["Tony", 20, 0, 18.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto commit_msgs2, + WriteArray(table_path, {{"f1", "20"}}, write_cols, src_array2)); + ASSERT_OK(Commit(table_path, commit_msgs2)); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f1", "20"}}}, "f0", "btree", + /*options=*/{}, Range(5, 7))); + + // Create a GlobalIndexScan with an explicit 8-thread executor + std::shared_ptr executor = 
CreateDefaultExecutor(/*thread_count=*/8); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, executor, pool_)); + + // CreateReaders should return 1 UnionGlobalIndexReader (2 sub-readers for 2 ranges) + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + + auto union_reader = std::dynamic_pointer_cast(index_readers[0]); + ASSERT_TRUE(union_reader); + ASSERT_EQ(union_reader->executor_, executor); + + // "Alice" in both partitions: global ids {0, 5} + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,5}"); + + // "Bob" only in f1=10: global ids {1, 2} + ASSERT_OK_AND_ASSIGN(auto result2, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result2); + ASSERT_EQ(result2->ToString(), "{1,2}"); + + // "Lucy" only in f1=20: global id {6} + ASSERT_OK_AND_ASSIGN(auto result3, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Lucy", 4))); + ASSERT_TRUE(result3); + ASSERT_EQ(result3->ToString(), "{6}"); + + // Full pipeline: evaluator with the 8-thread executor + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scanner_impl); + + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Tony", 4)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{4,7}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Tony", 10, 1, 15.1], +[0, 
"Tony", 20, 0, 18.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); +} + TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { // Test btree-global and bitmap index coexisting on the same field (f0). // The evaluator should AND their results, producing the intersection. @@ -2466,7 +2521,7 @@ TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { auto schema = arrow::schema(fields_); std::vector write_cols = schema->field_names(); - // Data sorted by f0 for btree: Alice < Bob < Bob < Emily < Lucy < Tony < Tony < null + // Data sorted by f0 for btree: Alice < Bob < Bob < Emily < Lucy < Tony < Tony auto src_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ ["Alice", 10, 1, 11.1], ["Bob", 10, 1, 12.1], @@ -2488,19 +2543,19 @@ TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", /*options=*/{}, Range(0, 7))); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // Two index types on f0 -> 2 readers ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); ASSERT_EQ(index_readers.size(), 2u); // Each reader individually should return the same result for Equal("Bob") - for (size_t i = 0; i < index_readers.size(); i++) { + for (const auto& index_reader : index_readers) { ASSERT_OK_AND_ASSIGN(auto result, - index_readers[i]->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + 
index_reader->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); ASSERT_TRUE(result); ASSERT_EQ(result->ToString(), "{1,2}"); } @@ -2514,8 +2569,7 @@ TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Bob", 3)); - ASSERT_OK_AND_ASSIGN(auto result, - evaluator->Evaluate(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto result, evaluator->Evaluate(predicate)); ASSERT_TRUE(result); ASSERT_EQ(result->ToString(), "{1,2}"); } @@ -2524,8 +2578,7 @@ TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Bob", 3)); - ASSERT_OK_AND_ASSIGN(auto result, - evaluator->Evaluate(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto result, evaluator->Evaluate(predicate)); ASSERT_TRUE(result); ASSERT_EQ(result->ToString(), "{0,3,4,5,6}"); } @@ -2533,8 +2586,7 @@ TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { // IsNull: both agree on row 7 auto predicate = PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING); - ASSERT_OK_AND_ASSIGN(auto result, - evaluator->Evaluate(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto result, evaluator->Evaluate(predicate)); ASSERT_TRUE(result); ASSERT_EQ(result->ToString(), "{7}"); } @@ -2544,8 +2596,7 @@ TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - scanner_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); ASSERT_TRUE(index_result); ASSERT_EQ(index_result->ToString(), "{0}"); @@ -2571,8 +2622,7 @@ TEST_P(GlobalIndexTest, 
TestBTreeAndBitmapCoexist) { auto f1_pred = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_pred, f1_pred})); - ASSERT_OK_AND_ASSIGN(auto index_result, - scanner_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); ASSERT_TRUE(index_result); ASSERT_EQ(index_result->ToString(), "{1,2}"); @@ -2598,10 +2648,10 @@ TEST_P(GlobalIndexTest, TestBTreeScanWithPartitionWithMultiMeta) { paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_btree_with_partition.db/append_with_btree_with_partition"; - ASSERT_OK_AND_ASSIGN( - std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); auto count_rows = [](const std::shared_ptr& result) -> int64_t { EXPECT_TRUE(result); @@ -2716,9 +2766,8 @@ TEST_P(GlobalIndexTest, TestBTreeScanWithPartitionWithMultiMeta) { auto eq_present, reader->VisitEqual(Literal(Decimal::FromUnscaledLong(5 * 123456L, 18, 6)))); ASSERT_EQ(count_rows(eq_present), 2); - ASSERT_OK_AND_ASSIGN( - auto eq_missing, - reader->VisitEqual(Literal(Decimal::FromUnscaledLong(/*unscaled=*/1L, 18, 6)))); + ASSERT_OK_AND_ASSIGN(auto eq_missing, + reader->VisitEqual(Literal(Decimal::FromUnscaledLong(1L, 18, 6)))); ASSERT_EQ(count_rows(eq_missing), 0); // GreaterThan(i=10): i in [11, 19] -> 18 rows globally. 
ASSERT_OK_AND_ASSIGN( @@ -2795,10 +2844,11 @@ TEST_P(GlobalIndexTest, TestBTreeWithLumina) { ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", /*options=*/lumina_options, Range(0, 7))); - ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, - /*options=*/lumina_options, fs_, pool_)); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, + /*options=*/lumina_options, fs_, /*executor=*/nullptr, pool_)); // Query f0 via btree { @@ -2839,8 +2889,7 @@ TEST_P(GlobalIndexTest, TestBTreeWithLumina) { auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Bob", 3)); - ASSERT_OK_AND_ASSIGN(auto index_result, - scanner_impl->Scan(predicate, /*row_range_index=*/std::nullopt)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); ASSERT_TRUE(index_result); ASSERT_EQ(index_result->ToString(), "{2,3}");