-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-42746][SQL] Implement LISTAGG function #48748
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0fecbd9
99cf932
db513cf
68ed739
c56f291
d8460c8
864f658
1de7ffe
274a96b
d23654c
8347939
412b8e7
90b2f2a
20b45dc
7c912d5
75b234a
f048dc9
77825a7
ce093b5
f2a1fb7
19fdafc
9ae3872
8e3aae3
8a3f705
c0f1496
2fcd373
bd088a4
269aca3
dd0bfaf
433e49b
6f54ab0
b0dc017
87999f4
a0f0e5d
8ba2466
ed46b31
885e812
8ba8566
9e70cc5
3382056
0f64921
d69ad1f
6f74b67
5050630
b75855d
3f2e296
14ee65e
e638d29
dfcc112
3cbe9e9
7105c7c
27cbd03
d514787
5fd9a30
516567a
27f445d
fc722df
e3b1a26
ad49fcf
9745a83
0aca46c
ca5b13a
5f37ae3
056ec61
cb5ad3e
07dfd82
be68e20
6a9c1fe
0efedf3
811c36c
9c5bd3d
e6d9c70
0bbd8af
d96ac1e
aee0ac5
91b759f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -135,27 +135,57 @@ public static byte[] subStringSQL(byte[] bytes, int pos, int len) { | |
return Arrays.copyOfRange(bytes, start, end); | ||
} | ||
|
||
/** | ||
* Concatenate multiple byte arrays into one. | ||
* If one of the inputs is null then null will be returned. | ||
* | ||
* @param inputs byte arrays to concatenate | ||
* @return the concatenated byte array or null if one of the arguments is null | ||
*/ | ||
public static byte[] concat(byte[]... inputs) { | ||
return concatWS(EMPTY_BYTE, inputs); | ||
} | ||
|
||
/** | ||
* Concatenate multiple byte arrays with a given delimiter. | ||
* If the delimiter or one of the inputs is null then null will be returned. | ||
* | ||
* @param delimiter byte array to be placed between each input | ||
* @param inputs byte arrays to concatenate | ||
* @return the concatenated byte array or null if one of the arguments is null | ||
*/ | ||
public static byte[] concatWS(byte[] delimiter, byte[]... inputs) { | ||
if (delimiter == null) { | ||
return null; | ||
} | ||
// Compute the total length of the result | ||
long totalLength = 0; | ||
for (byte[] input : inputs) { | ||
if (input != null) { | ||
totalLength += input.length; | ||
totalLength += input.length + delimiter.length; | ||
} else { | ||
return null; | ||
} | ||
} | ||
|
||
if (totalLength > 0) totalLength -= delimiter.length; | ||
// Allocate a new byte array, and copy the inputs one by one into it | ||
final byte[] result = new byte[Ints.checkedCast(totalLength)]; | ||
int offset = 0; | ||
for (byte[] input : inputs) { | ||
for (int i = 0; i < inputs.length; i++) { | ||
byte[] input = inputs[i]; | ||
int len = input.length; | ||
Platform.copyMemory( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this seems copied from L154 above, please dedup into one place? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't want to accidentally change existing behavior or performance so I thought a little copy-paste was justified in this isolated code. But I probably concern too much) |
||
input, Platform.BYTE_ARRAY_OFFSET, | ||
result, Platform.BYTE_ARRAY_OFFSET + offset, | ||
len); | ||
offset += len; | ||
if (delimiter.length > 0 && i < inputs.length - 1) { | ||
Platform.copyMemory( | ||
delimiter, Platform.BYTE_ARRAY_OFFSET, | ||
result, Platform.BYTE_ARRAY_OFFSET + offset, | ||
delimiter.length); | ||
offset += delimiter.length; | ||
} | ||
} | ||
return result; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1147,6 +1147,77 @@ object functions { | |
*/ | ||
def sum_distinct(e: Column): Column = Column.fn("sum", isDistinct = true, e) | ||
|
||
/** | ||
* Aggregate function: returns the concatenation of non-null input values. | ||
* | ||
* @group agg_funcs | ||
* @since 4.0.0 | ||
*/ | ||
def listagg(e: Column): Column = Column.fn("listagg", e) | ||
|
||
/** | ||
* Aggregate function: returns the concatenation of non-null input values, separated by the | ||
* delimiter. | ||
* | ||
* @group agg_funcs | ||
* @since 4.0.0 | ||
*/ | ||
def listagg(e: Column, delimiter: Column): Column = Column.fn("listagg", e, delimiter) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Declaring the delimiter as String here can improve UX a bit. Since it only allows foldable string literals, we can rely on the compiler instead of runtime errors, WDYT @cloud-fan There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SGTM |
||
|
||
/** | ||
* Aggregate function: returns the concatenation of distinct non-null input values. | ||
* | ||
* @group agg_funcs | ||
* @since 4.0.0 | ||
*/ | ||
def listagg_distinct(e: Column): Column = Column.fn("listagg", isDistinct = true, e) | ||
|
||
/** | ||
* Aggregate function: returns the concatenation of distinct non-null input values, separated by | ||
* the delimiter. | ||
* | ||
* @group agg_funcs | ||
* @since 4.0.0 | ||
*/ | ||
def listagg_distinct(e: Column, delimiter: Column): Column = | ||
Column.fn("listagg", isDistinct = true, e, delimiter) | ||
|
||
/** | ||
* Aggregate function: returns the concatenation of non-null input values. Alias for `listagg`. | ||
* | ||
* @group agg_funcs | ||
* @since 4.0.0 | ||
*/ | ||
def string_agg(e: Column): Column = Column.fn("string_agg", e) | ||
|
||
/** | ||
* Aggregate function: returns the concatenation of non-null input values, separated by the | ||
* delimiter. Alias for `listagg`. | ||
* | ||
* @group agg_funcs | ||
* @since 4.0.0 | ||
*/ | ||
def string_agg(e: Column, delimiter: Column): Column = Column.fn("string_agg", e, delimiter) | ||
|
||
/** | ||
* Aggregate function: returns the concatenation of distinct non-null input values. Alias for | ||
* `listagg`. | ||
* | ||
* @group agg_funcs | ||
* @since 4.0.0 | ||
*/ | ||
def string_agg_distinct(e: Column): Column = Column.fn("string_agg", isDistinct = true, e) | ||
|
||
/** | ||
* Aggregate function: returns the concatenation of distinct non-null input values, separated by | ||
* the delimiter. Alias for `listagg`. | ||
* | ||
* @group agg_funcs | ||
* @since 4.0.0 | ||
*/ | ||
def string_agg_distinct(e: Column, delimiter: Column): Column = | ||
Column.fn("string_agg", isDistinct = true, e, delimiter) | ||
|
||
/** | ||
* Aggregate function: alias for `var_samp`. | ||
* | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you please add a comment saying what this function is doing?