diff --git a/regression-test/data/ann_index_p0/ivf_index_test.out b/regression-test/data/ann_index_p0/ivf_index_test.out index 91c7483c806fd6..3e484c0b696ca4 100644 --- a/regression-test/data/ann_index_p0/ivf_index_test.out +++ b/regression-test/data/ann_index_p0/ivf_index_test.out @@ -7,6 +7,14 @@ 5 [50, 20, 20] 6 [60, 20, 20] +-- !sql_l2_topn -- +1 +2 + +-- !sql_l2_insufficient_train_rows -- +1 +2 + -- !sql -- 1 [1, 2, 3] 2 [0.5, 2.1, 2.9] @@ -15,3 +23,7 @@ 5 [50, 20, 20] 6 [60, 20, 20] +-- !sql_ip_topn -- +6 +5 + diff --git a/regression-test/data/ann_index_p0/ivf_on_disk_index_test.out b/regression-test/data/ann_index_p0/ivf_on_disk_index_test.out index ffeeb85e1335b3..bcd94f4ac52341 100644 --- a/regression-test/data/ann_index_p0/ivf_on_disk_index_test.out +++ b/regression-test/data/ann_index_p0/ivf_on_disk_index_test.out @@ -7,6 +7,10 @@ 5 [50, 20, 20] 6 [60, 20, 20] +-- !sql_l2_topn -- +1 +2 + -- !sql -- 1 [1, 2, 3] 2 [0.5, 2.1, 2.9] @@ -15,6 +19,22 @@ 5 [50, 20, 20] 6 [60, 20, 20] +-- !sql_ip_topn -- +6 +5 + +-- !sql_stream_load_rows -- +1 [1, 2, 3] +2 [0.5, 2.1, 2.9] +3 [10, 10, 10] +4 [20, 20, 20] +5 [50, 20, 20] +6 [60, 20, 20] + +-- !sql_stream_load_topn -- +1 +2 + -- !sql -- 1 [1, 2, 3] 2 [0.5, 2.1, 2.9] @@ -27,3 +47,13 @@ 9 [0, 0, 0] 10 [30, 30, 30] +-- !sql_large_topn -- +1 +2 +9 + +-- !sql_range_search -- +1 +2 +3 + diff --git a/regression-test/suites/ann_index_p0/ann_index_basic.groovy b/regression-test/suites/ann_index_p0/ann_index_basic.groovy index b64e795c993aea..41ae9e5c309864 100644 --- a/regression-test/suites/ann_index_p0/ann_index_basic.groovy +++ b/regression-test/suites/ann_index_p0/ann_index_basic.groovy @@ -22,9 +22,9 @@ suite ("ann_index_basic") { sql "set enable_common_expr_pushdown=true;" // 1) Basic L2 ANN table: dim=3 - sql "drop table if exists tbl_ann_l2" + sql "drop table if exists basic_tbl_ann_l2" sql """ - CREATE TABLE tbl_ann_l2 ( + CREATE TABLE basic_tbl_ann_l2 ( id INT NOT NULL, embedding ARRAY NOT NULL, INDEX idx_emb (`embedding`) USING ANN PROPERTIES( @@ -39,19 +39,19 @@ suite ("ann_index_basic") { """ qt_sql_l2_insert """ - INSERT INTO tbl_ann_l2 VALUES + INSERT INTO basic_tbl_ann_l2 VALUES (1, [1.0, 2.0, 3.0]), (2, [0.5, 2.1, 2.9]), (3, [10.0, 10.0, 10.0]); """ // Query: l2 distance ascending (closest first) - qt_sql_l2_query "select id, l2_distance_approximate(embedding, [1.0,2.0,3.0]) as dist from tbl_ann_l2 order by dist limit 3;" + qt_sql_l2_query "select id, l2_distance_approximate(embedding, [1.0,2.0,3.0]) as dist from basic_tbl_ann_l2 order by dist limit 3;" // 2) Basic inner_product ANN table: dim=4 - sql "drop table if exists tbl_ann_ip" + sql "drop table if exists basic_tbl_ann_ip" sql """ - CREATE TABLE tbl_ann_ip ( + CREATE TABLE basic_tbl_ann_ip ( id INT NOT NULL, embedding ARRAY NOT NULL, INDEX idx_emb (`embedding`) USING ANN PROPERTIES( @@ -66,23 +66,23 @@ suite ("ann_index_basic") { """ qt_sql_ip_insert """ - INSERT INTO tbl_ann_ip VALUES + INSERT INTO basic_tbl_ann_ip VALUES (1, [0.1, 0.2, 0.3, 0.4]), (2, [0.5, 0.6, 0.7, 0.8]), (3, [1.0, 1.0, 1.0, 1.0]); """ // Query: inner product descending (higher score first) - qt_sql_ip_query "select id from tbl_ann_ip order by inner_product_approximate(embedding, [0.1,0.2,0.3,0.4]) desc limit 3;" + qt_sql_ip_query "select id from basic_tbl_ann_ip order by inner_product_approximate(embedding, [0.1,0.2,0.3,0.4]) desc limit 3;" // 3) Simple threshold filter using l2_distance_approximate - qt_sql_l2_threshold "select id from tbl_ann_l2 where l2_distance_approximate(embedding, [1.0,2.0,3.0]) < 5.0 order by id;" + qt_sql_l2_threshold "select id from basic_tbl_ann_l2 where l2_distance_approximate(embedding, [1.0,2.0,3.0]) < 5.0 order by id;" // 4) Descending l2 order (should exercise path where Desc topn for l2/cosine cannot be evaluated by ann index) - qt_sql_l2_desc "select id from tbl_ann_l2 order by l2_distance_approximate(embedding, [1.0,2.0,3.0]) desc limit 2;" + qt_sql_l2_desc "select id from basic_tbl_ann_l2 order by l2_distance_approximate(embedding, [1.0,2.0,3.0]) desc limit 2;" // 5) Ascending inner_product order (should exercise path where Asc topn for inner product cannot be evaluated by ann index) - qt_sql_ip_asc "select id from tbl_ann_ip order by inner_product_approximate(embedding, [0.1,0.2,0.3,0.4]) asc limit 2;" + qt_sql_ip_asc "select id from basic_tbl_ann_ip order by inner_product_approximate(embedding, [0.1,0.2,0.3,0.4]) asc limit 2;" // 6) Large table to exercise predicate-input-ratio check (create many rows and run topn with small-range predicate) sql "drop table if exists tbl_ann_l2_large" diff --git a/regression-test/suites/ann_index_p0/ivf_index_test.groovy b/regression-test/suites/ann_index_p0/ivf_index_test.groovy index c806eed306885c..58eda2f2fce829 100644 --- a/regression-test/suites/ann_index_p0/ivf_index_test.groovy +++ b/regression-test/suites/ann_index_p0/ivf_index_test.groovy @@ -19,9 +19,9 @@ suite ("ivf_index_test") { sql "set enable_common_expr_pushdown=true;" // IVF index - sql "drop table if exists tbl_ann_l2" + sql "drop table if exists ivf_tbl_ann_l2" sql """ - CREATE TABLE tbl_ann_l2 ( + CREATE TABLE ivf_tbl_ann_l2 ( id INT NOT NULL, embedding ARRAY NOT NULL, INDEX idx_emb (`embedding`) USING ANN PROPERTIES( @@ -37,7 +37,7 @@ suite ("ivf_index_test") { """ sql """ - INSERT INTO tbl_ann_l2 VALUES + INSERT INTO ivf_tbl_ann_l2 VALUES (1, [1.0, 2.0, 3.0]), (2, [0.5, 2.1, 2.9]), (3, [10.0, 10.0, 10.0]), @@ -45,15 +45,14 @@ suite ("ivf_index_test") { (5, [50.0, 20.0, 20.0]), (6, [60.0, 20.0, 20.0]); """ - qt_sql "select * from tbl_ann_l2;" - // just approximate search - sql "select id, l2_distance_approximate(embedding, [1.0,2.0,3.0]) as dist from tbl_ann_l2 order by dist limit 2;" + qt_sql "select * from ivf_tbl_ann_l2 order by id;" + qt_sql_l2_topn "select id from ivf_tbl_ann_l2 order by l2_distance_approximate(embedding, [1.0,2.0,3.0]) limit 2;" - sql """drop table if exists tbl_ann_l2""" + sql """drop table if exists ivf_tbl_ann_l2""" test { // missing nlist sql """ - CREATE TABLE tbl_ann_l2 ( + CREATE TABLE ivf_tbl_ann_l2 ( id INT NOT NULL, embedding ARRAY NOT NULL, INDEX idx_emb (`embedding`) USING ANN PROPERTIES( @@ -70,7 +69,7 @@ suite ("ivf_index_test") { } sql """ - CREATE TABLE tbl_ann_l2 ( + CREATE TABLE ivf_tbl_ann_l2 ( id INT NOT NULL, embedding ARRAY NOT NULL, INDEX idx_emb (`embedding`) USING ANN PROPERTIES( @@ -86,14 +85,15 @@ suite ("ivf_index_test") { """ // Not enough training points: should not throw exception anymore, just skip index building. sql """ - INSERT INTO tbl_ann_l2 VALUES + INSERT INTO ivf_tbl_ann_l2 VALUES (1, [1.0, 2.0, 3.0]), (2, [0.5, 2.1, 2.9]); """ + qt_sql_l2_insufficient_train_rows "select id from ivf_tbl_ann_l2 order by l2_distance_approximate(embedding, [1.0,2.0,3.0]) limit 2;" - sql "drop table if exists tbl_ann_ip" + sql "drop table if exists ivf_tbl_ann_ip" sql """ - CREATE TABLE tbl_ann_ip ( + CREATE TABLE ivf_tbl_ann_ip ( id INT NOT NULL, embedding ARRAY NOT NULL, INDEX idx_emb (`embedding`) USING ANN PROPERTIES( @@ -109,7 +109,7 @@ suite ("ivf_index_test") { """ sql """ - INSERT INTO tbl_ann_ip VALUES + INSERT INTO ivf_tbl_ann_ip VALUES (1, [1.0, 2.0, 3.0]), (2, [0.5, 2.1, 2.9]), (3, [10.0, 10.0, 10.0]), @@ -117,7 +117,6 @@ suite ("ivf_index_test") { (5, [50.0, 20.0, 20.0]), (6, [60.0, 20.0, 20.0]); """ - qt_sql "select * from tbl_ann_ip;" - // just approximate search - sql "select id, inner_product_approximate(embedding, [1.0,2.0,3.0]) as dist from tbl_ann_ip order by dist desc limit 2;" + qt_sql "select * from ivf_tbl_ann_ip order by id;" + qt_sql_ip_topn "select id from ivf_tbl_ann_ip order by inner_product_approximate(embedding, [1.0,2.0,3.0]) desc limit 2;" } diff --git a/regression-test/suites/ann_index_p0/ivf_on_disk_index_test.groovy b/regression-test/suites/ann_index_p0/ivf_on_disk_index_test.groovy index ac77fa1c60b5f7..a9eed51d7a4125 100644 --- a/regression-test/suites/ann_index_p0/ivf_on_disk_index_test.groovy +++ b/regression-test/suites/ann_index_p0/ivf_on_disk_index_test.groovy @@ -45,9 +45,8 @@ suite ("ivf_on_disk_index_test") { (5, [50.0, 20.0, 20.0]), (6, [60.0, 20.0, 20.0]); """ - qt_sql "select * from tbl_ivf_on_disk_l2;" - // approximate search with l2_distance - sql "select id, l2_distance_approximate(embedding, [1.0,2.0,3.0]) as dist from tbl_ivf_on_disk_l2 order by dist limit 2;" + qt_sql "select * from tbl_ivf_on_disk_l2 order by id;" + qt_sql_l2_topn "select id from tbl_ivf_on_disk_l2 order by l2_distance_approximate(embedding, [1.0,2.0,3.0]) limit 2;" // ========== Error: missing nlist for ivf_on_disk ========== sql "drop table if exists tbl_ivf_on_disk_l2" @@ -121,9 +120,8 @@ suite ("ivf_on_disk_index_test") { (5, [50.0, 20.0, 20.0]), (6, [60.0, 20.0, 20.0]); """ - qt_sql "select * from tbl_ivf_on_disk_ip;" - // approximate search with inner_product - sql "select id, inner_product_approximate(embedding, [1.0,2.0,3.0]) as dist from tbl_ivf_on_disk_ip order by dist desc limit 2;" + qt_sql "select * from tbl_ivf_on_disk_ip order by id;" + qt_sql_ip_topn "select id from tbl_ivf_on_disk_ip order by inner_product_approximate(embedding, [1.0,2.0,3.0]) desc limit 2;" // ========== IVF_ON_DISK with stream load ========== sql "drop table if exists tbl_ivf_on_disk_stream_load" @@ -163,6 +161,8 @@ suite ("ivf_on_disk_index_test") { assertEquals(0, json.NumberFilteredRows) } } + qt_sql_stream_load_rows "select * from tbl_ivf_on_disk_stream_load order by id;" + qt_sql_stream_load_topn "select id from tbl_ivf_on_disk_stream_load order by l2_distance_approximate(embedding, [1.0,2.0,3.0]) limit 2;" // ========== IVF_ON_DISK with larger dataset (more rows than nlist) ========== sql "drop table if exists tbl_ivf_on_disk_large" @@ -196,8 +196,7 @@ suite ("ivf_on_disk_index_test") { (10, [30.0, 30.0, 30.0]); """ qt_sql "select * from tbl_ivf_on_disk_large order by id;" - // approximate search on larger dataset - sql "select id, l2_distance_approximate(embedding, [1.0,2.0,3.0]) as dist from tbl_ivf_on_disk_large order by dist limit 3;" + qt_sql_large_topn "select id from tbl_ivf_on_disk_large order by l2_distance_approximate(embedding, [1.0,2.0,3.0]) limit 3;" // ========== IVF_ON_DISK range search with l2_distance ========== sql "drop table if exists tbl_ivf_on_disk_range" @@ -226,6 +225,5 @@ suite ("ivf_on_disk_index_test") { (5, [50.0, 20.0, 20.0]), (6, [60.0, 20.0, 20.0]); """ - // range search: find vectors within distance threshold - sql "select id from tbl_ivf_on_disk_range where l2_distance_approximate(embedding, [1.0, 2.0, 3.0]) < 20.0 order by id;" + qt_sql_range_search "select id from tbl_ivf_on_disk_range where l2_distance_approximate(embedding, [1.0, 2.0, 3.0]) < 20.0 order by id;" }