apache · dawidwys · Feb 7, 2024 · Sep 8, 2023 · Dec 15, 2023 · Dec 18, 2023
diff --git a/docs/data/sql_functions.yml b/docs/data/sql_functions.yml
@@ -1059,6 +1059,13 @@ aggregate:
       Divides the rows for each window partition into `n` buckets ranging from 1 to at most `n`.
       If the number of rows in the window partition doesn't divide evenly into the number of buckets, then the remainder values are distributed one per bucket, starting with the first bucket.
       For example, with 6 rows and 4 buckets, the bucket values would be as follows: 1 1 2 2 3 4
+  - sql: ARRAY_AGG([ ALL | DISTINCT ] expression [ RESPECT NULLS | IGNORE NULLS ])
+    table: FIELD.arrayAgg
+    description: |
+      By default or with keyword `ALL`, returns an array that concatenates the input rows
+      and returns `NULL` if there are no input rows. Use `DISTINCT` for one unique instance of each value.
+      By default `NULL` values are respected, use `IGNORE NULLS` to skip `NULL` values.
+      The `ORDER BY` clause is currently not supported.
   - sql: JSON_OBJECTAGG([KEY] key VALUE value [ { NULL | ABSENT } ON NULL ])
     table: jsonObjectAgg(JsonOnNull, keyExpression, valueExpression)
     description: |

diff --git a/docs/data/sql_functions_zh.yml b/docs/data/sql_functions_zh.yml
@@ -1181,6 +1181,12 @@ aggregate:
       将窗口分区中的所有数据按照顺序划分为 n 个分组，返回分配给各行数据的分组编号（从 1 开始，最大为 n）。
       如果不能均匀划分为 n 个分组，则剩余值从第 1 个分组开始，为每一分组分配一个。
       比如某个窗口分区有 6 行数据，划分为 4 个分组，则各行的分组编号为：1，1，2，2，3，4。
+  - sql: ARRAY_AGG([ ALL | DISTINCT ] expression [ RESPECT NULLS | IGNORE NULLS ])
+    table: FIELD.arrayAgg
+    description: |
+      默认情况下或使用关键字 `ALL`，返回输入行中表达式所组成的数组，并且如果没有输入行，则返回 `NULL`。使用 `DISTINCT` 则对所有值去重后计算。
+      默认情况下 `NULL` 值不会被忽略，使用 `IGNORE NULLS` 忽略 `NULL` 值。
+      目前尚不支持 `ORDER BY` 子句。
   - sql: JSON_OBJECTAGG([KEY] key VALUE value [ { NULL | ABSENT } ON NULL ])
     table: jsonObjectAgg(JsonOnNull, keyExpression, valueExpression)
     description: |

diff --git a/flink-python/docs/reference/pyflink.table/expressions.rst b/flink-python/docs/reference/pyflink.table/expressions.rst
@@ -138,6 +138,7 @@ arithmetic functions
     Expression.var_pop
     Expression.var_samp
     Expression.collect
+    Expression.array_agg
     Expression.alias
     Expression.cast
     Expression.try_cast

diff --git a/flink-python/pyflink/table/expression.py b/flink-python/pyflink/table/expression.py
@@ -832,6 +832,10 @@ def var_samp(self) -> 'Expression':
     def collect(self) -> 'Expression':
         return _unary_op("collect")(self)
 
+    @property
+    def array_agg(self) -> 'Expression':
+        """
+        Returns the array aggregate of this expression.
+
+        Implemented as the unary operation ``arrayAgg`` applied to this expression.
+        """
+        return _unary_op("arrayAgg")(self)
+
     def alias(self, name: str, *extra_names: str) -> 'Expression[T]':
         """
         Specifies a name for an expression i.e. a field.

diff --git a/flink-python/pyflink/table/tests/test_expression.py b/flink-python/pyflink/table/tests/test_expression.py
@@ -114,6 +114,7 @@ def test_expression(self):
         self.assertEqual('varPop(a)', str(expr1.var_pop))
         self.assertEqual('varSamp(a)', str(expr1.var_samp))
         self.assertEqual('collect(a)', str(expr1.collect))
+        self.assertEqual('ARRAY_AGG(a)', str(expr1.array_agg))
         self.assertEqual("as(a, 'a', 'b', 'c')", str(expr1.alias('a', 'b', 'c')))
         self.assertEqual('cast(a, INT)', str(expr1.cast(DataTypes.INT())))
         self.assertEqual('asc(a)', str(expr1.asc))

diff --git a/flink-table/flink-sql-parser/src/main/codegen/data/Parser.tdd b/flink-table/flink-sql-parser/src/main/codegen/data/Parser.tdd
@@ -201,6 +201,7 @@
     "AFTER"
     "ALWAYS"
     "APPLY"
+    "ARRAY_AGG"
     "ASC"
     "ASSERTION"
     "ASSIGNMENT"

diff --git a/...le/flink-sql-parser/src/test/java/org/apache/flink/sql/parser/FlinkSqlParserImplTest.java b/...le/flink-sql-parser/src/test/java/org/apache/flink/sql/parser/FlinkSqlParserImplTest.java
@@ -63,12 +63,23 @@ void testDescribeCatalog() {
         sql("desc catalog a").ok("DESCRIBE CATALOG `A`");
     }
 
-    // ignore test methods that we don't support
-    // BEGIN
-    // ARRAY_AGG
-    @Disabled
     @Test
-    void testArrayAgg() {}
+    void testArrayAgg() {
+        // Covers explicit null treatment, ORDER BY inside the call, DISTINCT and the plain form.
+        // The expected unparsed SQL prints RESPECT NULLS after the closing parenthesis of the
+        // call, although the input writes it before ORDER BY.
+        sql("select\n"
+                        + "  array_agg(ename respect nulls order by deptno, ename) as c1,\n"
+                        + "  array_agg(ename order by deptno, ename desc) as c2,\n"
+                        + "  array_agg(distinct ename) as c3,\n"
+                        + "  array_agg(ename) as c4\n"
+                        + "from emp group by gender")
+                .ok(
+                        "SELECT"
+                                + " ARRAY_AGG(`ENAME` ORDER BY `DEPTNO`, `ENAME`) RESPECT NULLS AS `C1`,"
+                                + " ARRAY_AGG(`ENAME` ORDER BY `DEPTNO`, `ENAME` DESC) AS `C2`,"
+                                + " ARRAY_AGG(DISTINCT `ENAME`) AS `C3`,"
+                                + " ARRAY_AGG(`ENAME`) AS `C4`\n"
+                                + "FROM `EMP`\n"
+                                + "GROUP BY `GENDER`");
+    }
 
     // DESCRIBE SCHEMA
     @Disabled

diff --git a/...ink-table-api-java/src/main/java/org/apache/flink/table/api/internal/BaseExpressions.java b/...ink-table-api-java/src/main/java/org/apache/flink/table/api/internal/BaseExpressions.java
@@ -53,6 +53,7 @@
 import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.ABS;
 import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.ACOS;
 import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.AND;
+import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.ARRAY_AGG;
 import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.ARRAY_CONCAT;
 import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.ARRAY_CONTAINS;
 import static org.apache.flink.table.functions.BuiltInFunctionDefinitions.ARRAY_DISTINCT;
@@ -527,6 +528,11 @@ public OutType collect() {
         return toApiSpecificExpression(unresolvedCall(COLLECT, toExpr()));
     }
 
+    /**
+     * Returns the array aggregate of a given expression.
+     *
+     * <p>Builds an unresolved call to {@code ARRAY_AGG} with this expression as its only
+     * argument.
+     */
+    public OutType arrayAgg() {
+        return toApiSpecificExpression(unresolvedCall(ARRAY_AGG, toExpr()));
+    }
+
     /**
      * Returns a new value being cast to {@code toType}. A cast error throws an exception and fails
      * the job. When performing a cast operation that may fail, like {@link DataTypes#STRING()} to

diff --git a/...ble-common/src/main/java/org/apache/flink/table/functions/BuiltInFunctionDefinitions.java b/...ble-common/src/main/java/org/apache/flink/table/functions/BuiltInFunctionDefinitions.java
@@ -745,6 +745,13 @@ ANY, and(logical(LogicalTypeRoot.BOOLEAN), LITERAL)
                     .outputTypeStrategy(argument(0))
                     .build();
 
+    /**
+     * Definition of the {@code ARRAY_AGG} aggregate function. Only an output type strategy is
+     * configured here; no input type strategy is declared.
+     */
+    public static final BuiltInFunctionDefinition ARRAY_AGG =
+            BuiltInFunctionDefinition.newBuilder()
+                    .name("ARRAY_AGG")
+                    .kind(AGGREGATE)
+                    .outputTypeStrategy(nullableIfArgs(SpecificTypeStrategies.ARRAY))
+                    .build();
+
     // --------------------------------------------------------------------------------------------
     // String functions
     // --------------------------------------------------------------------------------------------

diff --git a/...anner/src/main/java/org/apache/flink/table/planner/expressions/SqlAggFunctionVisitor.java b/...anner/src/main/java/org/apache/flink/table/planner/expressions/SqlAggFunctionVisitor.java
@@ -84,6 +84,8 @@ public class SqlAggFunctionVisitor extends ExpressionDefaultVisitor<SqlAggFuncti
                 BuiltInFunctionDefinitions.VAR_SAMP, FlinkSqlOperatorTable.VAR_SAMP);
         AGG_DEF_SQL_OPERATOR_MAPPING.put(
                 BuiltInFunctionDefinitions.COLLECT, FlinkSqlOperatorTable.COLLECT);
+        AGG_DEF_SQL_OPERATOR_MAPPING.put(
+                BuiltInFunctionDefinitions.ARRAY_AGG, FlinkSqlOperatorTable.ARRAY_AGG);
         AGG_DEF_SQL_OPERATOR_MAPPING.put(
                 BuiltInFunctionDefinitions.JSON_OBJECTAGG_NULL_ON_NULL,
                 FlinkSqlOperatorTable.JSON_OBJECTAGG_NULL_ON_NULL);

diff --git a/...ner/src/main/java/org/apache/flink/table/planner/functions/sql/FlinkSqlOperatorTable.java b/...ner/src/main/java/org/apache/flink/table/planner/functions/sql/FlinkSqlOperatorTable.java
@@ -37,6 +37,8 @@
 import org.apache.calcite.sql.SqlPrefixOperator;
 import org.apache.calcite.sql.SqlSpecialOperator;
 import org.apache.calcite.sql.SqlSyntax;
+import org.apache.calcite.sql.fun.SqlBasicAggFunction;
+import org.apache.calcite.sql.fun.SqlLibraryOperators;
 import org.apache.calcite.sql.fun.SqlStdOperatorTable;
 import org.apache.calcite.sql.type.InferTypes;
 import org.apache.calcite.sql.type.OperandTypes;
@@ -1140,6 +1142,22 @@ public List<SqlGroupedWindowFunction> getAuxiliaryFunctions() {
     public static final SqlAggFunction APPROX_COUNT_DISTINCT =
             SqlStdOperatorTable.APPROX_COUNT_DISTINCT;
 
+    /**
+     * Flink's own definition of {@code ARRAY_AGG}, used instead of {@link
+     * SqlLibraryOperators#ARRAY_AGG} because Flink returns a nullable ARRAY type. An ORDER BY
+     * clause such as <code>ARRAY_AGG(x ORDER BY x, y)</code> is not supported yet for this
+     * aggregate function, because the row data cannot be obtained inside the aggregate function.
+     *
+     * <p>Null treatment is enabled on this operator via {@code withAllowsNullTreatment(true)},
+     * which corresponds to the {@code RESPECT NULLS} / {@code IGNORE NULLS} syntax.
+     */
+    public static final SqlAggFunction ARRAY_AGG =
+            SqlBasicAggFunction.create(
+                            SqlKind.ARRAY_AGG,
+                            ReturnTypes.cascade(
+                                    ReturnTypes.TO_ARRAY, SqlTypeTransforms.TO_NULLABLE),
+                            OperandTypes.ANY)
+                    .withFunctionType(SqlFunctionCategory.SYSTEM)
+                    .withSyntax(SqlSyntax.FUNCTION)
+                    .withAllowsNullTreatment(true);
+
     // ARRAY OPERATORS
     public static final SqlOperator ARRAY_VALUE_CONSTRUCTOR = new SqlArrayConstructor();
     public static final SqlOperator ELEMENT = SqlStdOperatorTable.ELEMENT;
@@ -1154,6 +1172,8 @@ public List<SqlGroupedWindowFunction> getAuxiliaryFunctions() {
     // SPECIAL OPERATORS
     public static final SqlOperator MULTISET_VALUE = SqlStdOperatorTable.MULTISET_VALUE;
     public static final SqlOperator ROW = SqlStdOperatorTable.ROW;
+    public static final SqlOperator IGNORE_NULLS = SqlStdOperatorTable.IGNORE_NULLS;
+    public static final SqlOperator RESPECT_NULLS = SqlStdOperatorTable.RESPECT_NULLS;
     public static final SqlOperator OVERLAPS = SqlStdOperatorTable.OVERLAPS;
     public static final SqlOperator LITERAL_CHAIN = SqlStdOperatorTable.LITERAL_CHAIN;
     public static final SqlOperator BETWEEN = SqlStdOperatorTable.BETWEEN;

diff --git a/...planner/src/main/scala/org/apache/flink/table/planner/plan/utils/AggFunctionFactory.scala b/...planner/src/main/scala/org/apache/flink/table/planner/plan/utils/AggFunctionFactory.scala
@@ -146,6 +146,9 @@ class AggFunctionFactory(
       case a: SqlAggFunction if a.getKind == SqlKind.COLLECT =>
         createCollectAggFunction(argTypes)
 
+      case a: SqlAggFunction if a.getKind == SqlKind.ARRAY_AGG =>
+        createArrayAggFunction(argTypes, call.ignoreNulls)
+
       case fn: SqlAggFunction if fn.getKind == SqlKind.JSON_OBJECTAGG =>
         val onNull = fn.asInstanceOf[SqlJsonObjectAggAggFunction].getNullClause
         new JsonObjectAggFunction(argTypes, onNull == SqlJsonConstructorNullClause.ABSENT_ON_NULL)
@@ -620,4 +623,10 @@ class AggFunctionFactory(
   private def createCollectAggFunction(argTypes: Array[LogicalType]): UserDefinedFunction = {
     new CollectAggFunction(argTypes(0))
   }
+
+  /**
+   * Creates the [[ArrayAggFunction]] for an ARRAY_AGG call.
+   *
+   * @param types
+   *   argument types of the call; only the first one is used
+   * @param ignoreNulls
+   *   null-treatment flag of the call, passed through to the function
+   */
+  private def createArrayAggFunction(
+      types: Array[LogicalType],
+      ignoreNulls: Boolean): UserDefinedFunction = {
+    new ArrayAggFunction(types(0), ignoreNulls)
+  }
 }
diff --git a/...able-planner/src/main/scala/org/apache/flink/table/planner/plan/utils/AggregateUtil.scala b/...able-planner/src/main/scala/org/apache/flink/table/planner/plan/utils/AggregateUtil.scala
@@ -849,7 +849,7 @@ object AggregateUtil extends Enumeration {
             call.getAggregation,
             false,
             false,
-            false,
+            call.ignoreNulls,
             call.getArgList,
             -1, // remove filterArg
             null,

diff --git a/...lanner/src/test/java/org/apache/flink/table/planner/functions/ArrayAggFunctionITCase.java b/...lanner/src/test/java/org/apache/flink/table/planner/functions/ArrayAggFunctionITCase.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.planner.functions;
+
+import org.apache.flink.table.functions.BuiltInFunctionDefinitions;
+import org.apache.flink.types.Row;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.stream.Stream;
+
+import static org.apache.flink.table.api.DataTypes.ARRAY;
+import static org.apache.flink.table.api.DataTypes.INT;
+import static org.apache.flink.table.api.DataTypes.ROW;
+import static org.apache.flink.table.api.DataTypes.STRING;
+import static org.apache.flink.table.api.Expressions.$;
+import static org.apache.flink.types.RowKind.DELETE;
+import static org.apache.flink.types.RowKind.INSERT;
+import static org.apache.flink.types.RowKind.UPDATE_AFTER;
+import static org.apache.flink.types.RowKind.UPDATE_BEFORE;
+
+/**
+ * Tests for built-in ARRAY_AGG aggregation functions.
+ *
+ * <p>The single spec feeds a changelog source (inserts, deletes and an update) grouped by {@code
+ * f0} and checks both the default null handling and {@code DISTINCT ... IGNORE NULLS}.
+ */
+class ArrayAggFunctionITCase extends BuiltInAggregateFunctionTestBase {
+
+    @Override
+    Stream<TestSpec> getTestCaseSpecs() {
+        return Stream.of(
+                TestSpec.forFunction(BuiltInFunctionDefinitions.ARRAY_AGG)
+                        .withDescription("ARRAY changelog stream aggregation")
+                        .withSource(
+                                ROW(STRING(), INT()),
+                                Arrays.asList(
+                                        Row.ofKind(INSERT, "A", 1),
+                                        Row.ofKind(INSERT, "A", 2),
+                                        Row.ofKind(INSERT, "B", 2),
+                                        Row.ofKind(INSERT, "B", 2),
+                                        Row.ofKind(INSERT, "B", 3),
+                                        Row.ofKind(INSERT, "C", 3),
+                                        Row.ofKind(INSERT, "C", null),
+                                        Row.ofKind(DELETE, "C", null),
+                                        Row.ofKind(INSERT, "D", null),
+                                        Row.ofKind(INSERT, "E", 4),
+                                        Row.ofKind(INSERT, "E", 5),
+                                        Row.ofKind(DELETE, "E", 5),
+                                        Row.ofKind(UPDATE_BEFORE, "E", 4),
+                                        Row.ofKind(UPDATE_AFTER, "E", 6)))
+                        // Default null treatment: the NULL of key "D" is expected as an array
+                        // element, while retracted values ("C" NULL, "E" 4 and 5) are gone.
+                        .testResult(
+                                source ->
+                                        "SELECT f0, array_agg(f1) FROM " + source + " GROUP BY f0",
+                                TableApiAggSpec.groupBySelect(
+                                        Collections.singletonList($("f0")),
+                                        $("f0"),
+                                        $("f1").arrayAgg()),
+                                ROW(STRING(), ARRAY(INT())),
+                                ROW(STRING(), ARRAY(INT())),
+                                Arrays.asList(
+                                        Row.of("A", new Integer[] {1, 2}),
+                                        Row.of("B", new Integer[] {2, 2, 3}),
+                                        Row.of("C", new Integer[] {3}),
+                                        Row.of("D", new Integer[] {null}),
+                                        Row.of("E", new Integer[] {6})))
+                        // DISTINCT + IGNORE NULLS: duplicates collapse for "B", and key "D", whose
+                        // only input is NULL, is expected to yield a NULL aggregate.
+                        .testSqlResult(
+                                source ->
+                                        "SELECT f0, array_agg(DISTINCT f1 IGNORE NULLS) FROM "
+                                                + source
+                                                + " GROUP BY f0",
+                                ROW(STRING(), ARRAY(INT())),
+                                Arrays.asList(
+                                        Row.of("A", new Integer[] {1, 2}),
+                                        Row.of("B", new Integer[] {2, 3}),
+                                        Row.of("C", new Integer[] {3}),
+                                        Row.of("D", null),
+                                        Row.of("E", new Integer[] {6}))));
+    }
+}