Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions datafusion/core/src/physical_plan/unnest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,23 @@ use crate::physical_plan::{
use super::DisplayAs;

/// Unnest the given column by joining the row with each value in the nested type.
///
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the point of view of all our reference databases, this output is incorrect:
PostgreSQL:

postgres=# select * from unnest_example;
  c1   | c2 
-------+----
 {1,2} | A
       | B
 {}    | D
 {3}   | E
(4 rows)

postgres=# select unnest(c1), c2 from unnest_example;
 unnest | c2 
--------+----
      1 | A
      2 | A
      3 | E
(3 rows)

DuckDB:

D select * from unnest_example;
┌─────────┬─────────┐
│   c1    │   c2    │
│ int32[] │ varchar │
├─────────┼─────────┤
│ [1, 2]  │ A       │
│         │ B       │
│ []      │ D       │
│ [3]     │ E       │
└─────────┴─────────┘
D select unnest(c1), c2 from unnest_example;
┌────────────┬─────────┐
│ unnest(c1) │   c2    │
│   int32    │ varchar │
├────────────┼─────────┤
│          1 │ A       │
│          2 │ A       │
│          3 │ E       │
└────────────┴─────────┘

ClickHouse:

SELECT *
FROM unnest_example

Query id: be77ac76-c0ee-454a-968c-fa07afcf732d

┌─c1────┬─c2─┐
│ []    │ B  │
│ []    │ D  │
│ [1,2] │ A  │
│ [3]   │ E  │
└───────┴────┘

4 rows in set. Elapsed: 0.001 sec. 

DESKTOP-HM8OC4I. :) select arrayJoin(c1), c2 from unnest_example;

SELECT
    arrayJoin(c1),
    c2
FROM unnest_example

Query id: 3b8d0451-54f5-4a9a-8938-5505fe75a9df

┌─arrayJoin(c1)─┬─c2─┐
│             1 │ A  │
│             2 │ A  │
│             3 │ E  │
└───────────────┴────┘

3 rows in set. Elapsed: 0.001 sec.

/// For example, calling unnest(c1) results in the following:
///
/// ```text
/// ┌─────────┐ ┌─────┐ ┌─────────┐ ┌─────┐
/// │ {1, 2} │ │ A │ Unnest │ 1 │ │ A │
/// ├─────────┤ ├─────┤ ├─────────┤ ├─────┤
/// │ null │ │ B │ │ 2 │ │ A │
/// ├─────────┤ ├─────┤ ────────────▶ ├─────────┤ ├─────┤
/// │ {} │ │ D │ │ null │ │ B │
/// ├─────────┤ ├─────┤ ├─────────┤ ├─────┤
/// │ {3} │ │ E │ │ null │ │ D │
/// └─────────┘ └─────┘ ├─────────┤ ├─────┤
/// c1 c2 │ 3 │ │ E │
/// └─────────┘ └─────┘
/// c1 c2
/// ```
#[derive(Debug)]
pub struct UnnestExec {
/// Input execution plan
Expand Down
65 changes: 65 additions & 0 deletions datafusion/core/tests/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1114,6 +1114,71 @@ async fn unnest_fixed_list() -> Result<()> {
Ok(())
}

#[tokio::test]
async fn unnest_fixed_list_nonull() -> Result<()> {
let mut shape_id_builder = UInt32Builder::new();
let mut tags_builder = FixedSizeListBuilder::new(StringBuilder::new(), 2);

for idx in 0..6 {
// Append shape id.
shape_id_builder.append_value(idx as u32 + 1);

tags_builder
.values()
.append_value(format!("tag{}1", idx + 1));
tags_builder
.values()
.append_value(format!("tag{}2", idx + 1));
tags_builder.append(true);
}

let batch = RecordBatch::try_from_iter(vec![
("shape_id", Arc::new(shape_id_builder.finish()) as ArrayRef),
("tags", Arc::new(tags_builder.finish()) as ArrayRef),
])?;

let ctx = SessionContext::new();
ctx.register_batch("shapes", batch)?;
let df = ctx.table("shapes").await?;

let results = df.clone().collect().await?;
let expected = vec![
"+----------+----------------+",
"| shape_id | tags |",
"+----------+----------------+",
"| 1 | [tag11, tag12] |",
"| 2 | [tag21, tag22] |",
"| 3 | [tag31, tag32] |",
"| 4 | [tag41, tag42] |",
"| 5 | [tag51, tag52] |",
"| 6 | [tag61, tag62] |",
"+----------+----------------+",
];
assert_batches_sorted_eq!(expected, &results);

let results = df.unnest_column("tags")?.collect().await?;
let expected = vec![
"+----------+-------+",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's correct test:

postgres=# select * from unnest_example2;
 c1 |      c2       
----+---------------
  1 | {tag11,tag12}
  2 | {tag21,tag22}
  3 | {tag31,tag32}
  4 | {tag41,tag42}
  5 | {tag51,tag52}
  6 | {tag61,tag62}
(6 rows)

postgres=# select c1, unnest(c2) from unnest_example2;
 c1 | unnest 
----+--------
  1 | tag11
  1 | tag12
  2 | tag21
  2 | tag22
  3 | tag31
  3 | tag32
  4 | tag41
  4 | tag42
  5 | tag51
  5 | tag52
  6 | tag61
  6 | tag62
(12 rows)

"| shape_id | tags |",
"+----------+-------+",
"| 1 | tag11 |",
"| 1 | tag12 |",
"| 2 | tag21 |",
"| 2 | tag22 |",
"| 3 | tag31 |",
"| 3 | tag32 |",
"| 4 | tag41 |",
"| 4 | tag42 |",
"| 5 | tag51 |",
"| 5 | tag52 |",
"| 6 | tag61 |",
"| 6 | tag62 |",
"+----------+-------+",
];
assert_batches_sorted_eq!(expected, &results);

Ok(())
}
#[tokio::test]
async fn unnest_aggregate_columns() -> Result<()> {
const NUM_ROWS: usize = 5;
Expand Down
20 changes: 19 additions & 1 deletion datafusion/expr/src/logical_plan/plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ pub enum LogicalPlan {
Ddl(DdlStatement),
/// Describe the schema of table
DescribeTable(DescribeTable),
/// Unnest a column that contains a nested list type.
/// Unnest a column that contains a nested list type. See
/// [`Unnest`] for more details.
Unnest(Unnest),
}

Expand Down Expand Up @@ -1753,6 +1754,23 @@ pub enum Partitioning {
}

/// Unnest a column that contains a nested list type.
///
/// For example, calling unnest(c1) results in the following:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the point of view of all our reference databases, this output is incorrect:
PostgreSQL:

postgres=# select * from unnest_example;
  c1   | c2 
-------+----
 {1,2} | A
       | B
 {}    | D
 {3}   | E
(4 rows)

postgres=# select unnest(c1), c2 from unnest_example;
 unnest | c2 
--------+----
      1 | A
      2 | A
      3 | E
(3 rows)

DuckDB:

D select * from unnest_example;
┌─────────┬─────────┐
│   c1    │   c2    │
│ int32[] │ varchar │
├─────────┼─────────┤
│ [1, 2]  │ A       │
│         │ B       │
│ []      │ D       │
│ [3]     │ E       │
└─────────┴─────────┘
D select unnest(c1), c2 from unnest_example;
┌────────────┬─────────┐
│ unnest(c1) │   c2    │
│   int32    │ varchar │
├────────────┼─────────┤
│          1 │ A       │
│          2 │ A       │
│          3 │ E       │
└────────────┴─────────┘

ClickHouse:

SELECT *
FROM unnest_example

Query id: be77ac76-c0ee-454a-968c-fa07afcf732d

┌─c1────┬─c2─┐
│ []    │ B  │
│ []    │ D  │
│ [1,2] │ A  │
│ [3]   │ E  │
└───────┴────┘

4 rows in set. Elapsed: 0.001 sec. 

DESKTOP-HM8OC4I. :) select arrayJoin(c1), c2 from unnest_example;

SELECT
    arrayJoin(c1),
    c2
FROM unnest_example

Query id: 3b8d0451-54f5-4a9a-8938-5505fe75a9df

┌─arrayJoin(c1)─┬─c2─┐
│             1 │ A  │
│             2 │ A  │
│             3 │ E  │
└───────────────┴────┘

3 rows in set. Elapsed: 0.001 sec.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the current implementation adds a row if there's a null value and would likely skip the row if the array is empty.

I can work on fixing this if that's needed

Copy link
Contributor

@vincev vincev Jul 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes when I implemented unnest_column I followed the logic in Pandas and Polars:

          ints  shape_id
[3, 88, 94]         1
       [73]         2
       None         3
   [43, 97]         4
       None         5
       [65]         6

by unnesting ints we get:

   ints  shape_id
   3         1
  88         1
  94         1
  73         2
None         3
  43         4
  97         4
None         5
  65         6

that I think is better because you preserve all the information that was in the nested data and if then you want to nest the data again you get back the same data you started with.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you decide to implement the same behavior you have in DuckDB would be nice to have an option to preserve the old behavior.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have the code ready to change this but need someone to confirm if this is what we want.
To support both ways, we could introduce some extra boolean flag like keep_nulls.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a keep_nulls flag sounds like a good idea to me

So is the consensus for "duckdb/clickhouse" behavior by default, with a flag for keep_nulls? If so, I can update the examples in this PR

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe to avoid breaking API changes to the LogicalPlanBuilder::unnest_column interface we can add a LogicalPlanBuilder::unnest_with_options method that will set the flag in the LogicalPlan::Unnest struct.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll have a play with the API and make a proposal

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is a proposed API -- please let me know what you think: #7088

///
/// ```text
/// ┌─────────┐ ┌─────┐ ┌─────────┐ ┌─────┐
/// │ {1, 2} │ │ A │ Unnest │ 1 │ │ A │
/// ├─────────┤ ├─────┤ ├─────────┤ ├─────┤
/// │ null │ │ B │ │ 2 │ │ A │
/// ├─────────┤ ├─────┤ ────────────▶ ├─────────┤ ├─────┤
/// │ {} │ │ D │ │ null │ │ B │
/// ├─────────┤ ├─────┤ ├─────────┤ ├─────┤
/// │ {3} │ │ E │ │ null │ │ D │
/// └─────────┘ └─────┘ ├─────────┤ ├─────┤
/// c1 c2 │ 3 │ │ E │
/// └─────────┘ └─────┘
/// c1 c2
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Unnest {
/// The incoming logical plan
Expand Down