[SPARK-27988][SQL][TEST] Port AGGREGATES.sql [Part 3]

## What changes were proposed in this pull request?

This PR ports AGGREGATES.sql from the PostgreSQL regression tests: https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L352-L605

The expected results can be found at: https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/expected/aggregates.out#L986-L1613

While porting the test cases, we found seven PostgreSQL-specific features that do not exist in Spark SQL:

- [SPARK-27974](https://issues.apache.org/jira/browse/SPARK-27974): Add built-in Aggregate Function: array_agg
- [SPARK-27978](https://issues.apache.org/jira/browse/SPARK-27978): Add built-in Aggregate Functions: string_agg
- [SPARK-27986](https://issues.apache.org/jira/browse/SPARK-27986): Support Aggregate Expressions with filter
- [SPARK-27987](https://issues.apache.org/jira/browse/SPARK-27987): Support POSIX Regular Expressions
- [SPARK-28682](https://issues.apache.org/jira/browse/SPARK-28682): ANSI SQL: Collation Support
- [SPARK-28768](https://issues.apache.org/jira/browse/SPARK-28768): Implement more text pattern operators
- [SPARK-28865](https://issues.apache.org/jira/browse/SPARK-28865): Table inheritance

## How was this patch tested?

N/A

Closes #24829 from wangyum/SPARK-27988.

Authored-by: Yuming Wang <yumwang@ebay.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
apache · Aug 25, 2019 · 4b16cf1 · 4b16cf1
1 parent 02a0cde
commit 4b16cf1
Show file tree

Hide file tree

Showing 2 changed files with 292 additions and 0 deletions.
diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part3.sql
@@ -0,0 +1,270 @@
+--
+-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+--
+--
+-- AGGREGATES [Part 3]
+-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L352-L605
+
+-- [SPARK-28865] Table inheritance
+-- try it on an inheritance tree
+-- create table minmaxtest(f1 int);
+-- create table minmaxtest1() inherits (minmaxtest);
+-- create table minmaxtest2() inherits (minmaxtest);
+-- create table minmaxtest3() inherits (minmaxtest);
+-- create index minmaxtesti on minmaxtest(f1);
+-- create index minmaxtest1i on minmaxtest1(f1);
+-- create index minmaxtest2i on minmaxtest2(f1 desc);
+-- create index minmaxtest3i on minmaxtest3(f1) where f1 is not null;
+
+-- insert into minmaxtest values(11), (12);
+-- insert into minmaxtest1 values(13), (14);
+-- insert into minmaxtest2 values(15), (16);
+-- insert into minmaxtest3 values(17), (18);
+
+-- explain (costs off)
+--   select min(f1), max(f1) from minmaxtest;
+-- select min(f1), max(f1) from minmaxtest;
+
+-- DISTINCT doesn't do anything useful here, but it shouldn't fail
+-- explain (costs off)
+--   select distinct min(f1), max(f1) from minmaxtest;
+-- select distinct min(f1), max(f1) from minmaxtest;
+
+-- drop table minmaxtest cascade;
+
+-- check for correct detection of nested-aggregate errors (expected result: AnalysisException)
+select max(min(unique1)) from tenk1;
+-- select (select max(min(unique1)) from int8_tbl) from tenk1;
+
+-- The tests below only check EXPLAIN output, so they are skipped.
+--
+-- Test removal of redundant GROUP BY columns
+--
+
+-- create temp table t1 (a int, b int, c int, d int, primary key (a, b));
+-- create temp table t2 (x int, y int, z int, primary key (x, y));
+-- create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
+
+-- Non-primary-key columns can be removed from GROUP BY
+-- explain (costs off) select * from t1 group by a,b,c,d;
+
+-- No removal can happen if the complete PK is not present in GROUP BY
+-- explain (costs off) select a,c from t1 group by a,c,d;
+
+-- Test removal across multiple relations
+-- explain (costs off) select *
+-- from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+-- group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
+
+-- Test case where t1 can be optimized but not t2
+-- explain (costs off) select t1.*,t2.x,t2.z
+-- from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+-- group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
+
+-- Cannot optimize when PK is deferrable
+-- explain (costs off) select * from t3 group by a,b,c;
+
+-- drop table t1;
+-- drop table t2;
+-- drop table t3;
+
+-- [SPARK-27974] Add built-in Aggregate Function: array_agg
+--
+-- Test combinations of DISTINCT and/or ORDER BY
+--
+
+-- select array_agg(a order by b)
+--   from (values (1,4),(2,3),(3,1),(4,2)) v(a,b);
+-- select array_agg(a order by a)
+--   from (values (1,4),(2,3),(3,1),(4,2)) v(a,b);
+-- select array_agg(a order by a desc)
+--   from (values (1,4),(2,3),(3,1),(4,2)) v(a,b);
+-- select array_agg(b order by a desc)
+--   from (values (1,4),(2,3),(3,1),(4,2)) v(a,b);
+
+-- select array_agg(distinct a)
+--   from (values (1),(2),(1),(3),(null),(2)) v(a);
+-- select array_agg(distinct a order by a)
+--   from (values (1),(2),(1),(3),(null),(2)) v(a);
+-- select array_agg(distinct a order by a desc)
+--   from (values (1),(2),(1),(3),(null),(2)) v(a);
+-- select array_agg(distinct a order by a desc nulls last)
+--   from (values (1),(2),(1),(3),(null),(2)) v(a);
+
+-- Skip the tests below because they require 4 UDAFs: aggf_trans, aggfns_trans, aggfstr, and aggfns
+-- multi-arg aggs, strict/nonstrict, distinct/order by
+
+-- select aggfstr(a,b,c)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c);
+-- select aggfns(a,b,c)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c);
+
+-- select aggfstr(distinct a,b,c)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--        generate_series(1,3) i;
+-- select aggfns(distinct a,b,c)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--        generate_series(1,3) i;
+
+-- select aggfstr(distinct a,b,c order by b)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--        generate_series(1,3) i;
+-- select aggfns(distinct a,b,c order by b)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--        generate_series(1,3) i;
+
+-- test specific code paths
+
+-- [SPARK-28768] Implement more text pattern operators
+-- select aggfns(distinct a,a,c order by c using ~<~,a)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--        generate_series(1,2) i;
+-- select aggfns(distinct a,a,c order by c using ~<~)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--        generate_series(1,2) i;
+-- select aggfns(distinct a,a,c order by a)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--        generate_series(1,2) i;
+-- select aggfns(distinct a,b,c order by a,c using ~<~,b)
+--   from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--        generate_series(1,2) i;
+
+-- check node I/O via view creation and usage, also deparsing logic
+
+-- create view agg_view1 as
+--   select aggfns(a,b,c)
+--     from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c);
+
+-- select * from agg_view1;
+-- select pg_get_viewdef('agg_view1'::regclass);
+
+-- create or replace view agg_view1 as
+--   select aggfns(distinct a,b,c)
+--     from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--          generate_series(1,3) i;
+
+-- select * from agg_view1;
+-- select pg_get_viewdef('agg_view1'::regclass);
+
+-- create or replace view agg_view1 as
+--   select aggfns(distinct a,b,c order by b)
+--     from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--          generate_series(1,3) i;
+
+-- select * from agg_view1;
+-- select pg_get_viewdef('agg_view1'::regclass);
+
+-- create or replace view agg_view1 as
+--   select aggfns(a,b,c order by b+1)
+--     from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c);
+
+-- select * from agg_view1;
+-- select pg_get_viewdef('agg_view1'::regclass);
+
+-- create or replace view agg_view1 as
+--   select aggfns(a,a,c order by b)
+--     from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c);
+
+-- select * from agg_view1;
+-- select pg_get_viewdef('agg_view1'::regclass);
+
+-- create or replace view agg_view1 as
+--   select aggfns(a,b,c order by c using ~<~)
+--     from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c);
+
+-- select * from agg_view1;
+-- select pg_get_viewdef('agg_view1'::regclass);
+
+-- create or replace view agg_view1 as
+--   select aggfns(distinct a,b,c order by a,c using ~<~,b)
+--     from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--          generate_series(1,2) i;
+
+-- select * from agg_view1;
+-- select pg_get_viewdef('agg_view1'::regclass);
+
+-- drop view agg_view1;
+
+-- incorrect DISTINCT usage errors
+
+-- select aggfns(distinct a,b,c order by i)
+--   from (values (1,1,'foo')) v(a,b,c), generate_series(1,2) i;
+-- select aggfns(distinct a,b,c order by a,b+1)
+--   from (values (1,1,'foo')) v(a,b,c), generate_series(1,2) i;
+-- select aggfns(distinct a,b,c order by a,b,i,c)
+--   from (values (1,1,'foo')) v(a,b,c), generate_series(1,2) i;
+-- select aggfns(distinct a,a,c order by a,b)
+--   from (values (1,1,'foo')) v(a,b,c), generate_series(1,2) i;
+
+-- [SPARK-27978] Add built-in Aggregate Functions: string_agg
+-- string_agg tests
+-- select string_agg(a,',') from (values('aaaa'),('bbbb'),('cccc')) g(a);
+-- select string_agg(a,',') from (values('aaaa'),(null),('bbbb'),('cccc')) g(a);
+-- select string_agg(a,'AB') from (values(null),(null),('bbbb'),('cccc')) g(a);
+-- select string_agg(a,',') from (values(null),(null)) g(a);
+
+-- check some implicit casting cases, as per bug #5564
+-- select string_agg(distinct f1, ',' order by f1) from varchar_tbl;  -- ok
+-- select string_agg(distinct f1::text, ',' order by f1) from varchar_tbl;  -- not ok
+-- select string_agg(distinct f1, ',' order by f1::text) from varchar_tbl;  -- not ok
+-- select string_agg(distinct f1::text, ',' order by f1::text) from varchar_tbl;  -- ok
+
+-- [SPARK-28121] decode can not accept 'hex' as charset
+-- string_agg bytea tests
+-- CREATE TABLE bytea_test_table(v BINARY) USING parquet;
+
+-- select string_agg(v, '') from bytea_test_table;
+
+-- insert into bytea_test_table values(decode('ff','hex'));
+
+-- select string_agg(v, '') from bytea_test_table;
+
+-- insert into bytea_test_table values(decode('aa','hex'));
+
+-- select string_agg(v, '') from bytea_test_table;
+-- select string_agg(v, NULL) from bytea_test_table;
+-- select string_agg(v, decode('ee', 'hex')) from bytea_test_table;
+
+-- drop table bytea_test_table;
+
+-- [SPARK-27986] Support Aggregate Expressions with filter
+-- FILTER tests
+
+-- select min(unique1) filter (where unique1 > 100) from tenk1;
+
+-- select sum(1/ten) filter (where ten > 0) from tenk1;
+
+-- [SPARK-27987] Support POSIX Regular Expressions
+-- select ten, sum(distinct four) filter (where four::text ~ '123') from onek a
+-- group by ten;
+
+-- select ten, sum(distinct four) filter (where four > 10) from onek a
+-- group by ten
+-- having exists (select 1 from onek b where sum(distinct a.four) = b.four);
+
+-- [SPARK-28682] ANSI SQL: Collation Support
+-- select max(foo COLLATE "C") filter (where (bar collate "POSIX") > '0')
+-- from (values ('a', 'b')) AS v(foo,bar);
+
+-- outer reference in FILTER (PostgreSQL extension); only the FILTER-free query is enabled, the others need [SPARK-27986]
+select (select count(*)
+        from (values (1)) t0(inner_c))
+from (values (2),(3)) t1(outer_c); -- inner query is aggregation query
+-- select (select count(*) filter (where outer_c <> 0)
+--         from (values (1)) t0(inner_c))
+-- from (values (2),(3)) t1(outer_c); -- outer query is aggregation query
+-- select (select count(inner_c) filter (where outer_c <> 0)
+--         from (values (1)) t0(inner_c))
+-- from (values (2),(3)) t1(outer_c); -- inner query is aggregation query
+-- select
+--   (select max((select i.unique2 from tenk1 i where i.unique1 = o.unique1))
+--      filter (where o.unique1 < 10))
+-- from tenk1 o;					-- outer query is aggregation query
+
+-- subquery in FILTER clause (PostgreSQL extension)
+-- select sum(unique1) FILTER (WHERE
+--  unique1 IN (SELECT unique1 FROM onek where unique1 < 100)) FROM tenk1;
+
+-- exercise lots of aggregate parts with FILTER
+-- select aggfns(distinct a,b,c order by a,c using ~<~,b) filter (where a > 1)
+--     from (values (1,3,'foo'),(0,null,null),(2,2,'bar'),(3,1,'baz')) v(a,b,c),
+--     generate_series(1,2) i;
diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part3.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part3.sql.out
@@ -0,0 +1,22 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 2
+
+
+-- !query 0
+select max(min(unique1)) from tenk1
+-- !query 0 schema
+struct<>
+-- !query 0 output
+org.apache.spark.sql.AnalysisException
+It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.;
+
+
+-- !query 1
+select (select count(*)
+        from (values (1)) t0(inner_c))
+from (values (2),(3)) t1(outer_c)
+-- !query 1 schema
+struct<scalarsubquery():bigint>
+-- !query 1 output
+1
+1