Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 85 additions & 30 deletions datafusion/physical-plan/src/joins/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,7 @@ fn estimate_join_cardinality(
right_stats: Statistics,
on: &JoinOn,
) -> Option<PartialJoinStatistics> {
let (left_col_stats, right_col_stats) = on
let (left_key_stats, right_key_stats) = on
.iter()
.map(|(left, right)| {
match (
Expand All @@ -500,12 +500,12 @@ fn estimate_join_cardinality(
Statistics {
num_rows: left_stats.num_rows,
total_byte_size: Precision::Absent,
column_statistics: left_col_stats,
column_statistics: left_key_stats,
},
Statistics {
num_rows: right_stats.num_rows,
total_byte_size: Precision::Absent,
column_statistics: right_col_stats,
column_statistics: right_key_stats,
},
)?;

Expand Down Expand Up @@ -545,38 +545,49 @@ fn estimate_join_cardinality(
let is_left = matches!(join_type, JoinType::LeftSemi | JoinType::LeftAnti);
let is_anti = matches!(join_type, JoinType::LeftAnti | JoinType::RightAnti);

let ((outer_stats, inner_stats), (outer_col_stats, inner_col_stats)) =
if is_left {
(
(&left_stats, &right_stats),
(&left_col_stats, &right_col_stats),
)
} else {
(
(&right_stats, &left_stats),
(&right_col_stats, &left_col_stats),
)
};
let (outer_stats, inner_stats, outer_key_stats, inner_key_stats) = if is_left
{
(&left_stats, &right_stats, &left_key_stats, &right_key_stats)
} else {
(&right_stats, &left_stats, &right_key_stats, &left_key_stats)
};

let outer_rows = *outer_stats.num_rows.get_value()?;

let cardinality =
if estimate_disjoint_inputs(outer_stats, inner_stats).is_some() {
// Disjoint inputs: semi produces 0, anti keeps all rows.
if is_anti { outer_rows } else { 0 }
let outer_join_key_stats = Statistics {
num_rows: outer_stats.num_rows,
total_byte_size: Precision::Absent,
column_statistics: outer_key_stats.clone(),
};
let inner_join_key_stats = Statistics {
num_rows: inner_stats.num_rows,
total_byte_size: Precision::Absent,
column_statistics: inner_key_stats.clone(),
};

let semi_cardinality =
if estimate_disjoint_inputs(&outer_join_key_stats, &inner_join_key_stats)
.is_some()
{
// If join keys are disjoint, no rows will match
Some(0)
} else {
match estimate_semi_join_cardinality(
estimate_semi_join_cardinality(
&outer_stats.num_rows,
&inner_stats.num_rows,
outer_col_stats,
inner_col_stats,
) {
Some(semi) if is_anti => outer_rows.saturating_sub(semi),
Some(semi) => semi,
None => outer_rows,
}
outer_key_stats,
inner_key_stats,
)
};

// Semi joins keep the matching rows; anti joins keep the rest. When no
// estimate is available, conservatively assume all outer rows pass.
let cardinality = match (semi_cardinality, is_anti) {
(Some(semi), true) => outer_rows.saturating_sub(semi),
(Some(semi), false) => semi,
(None, _) => outer_rows,
};

let outer_stats = if is_left { left_stats } else { right_stats };
Some(PartialJoinStatistics {
num_rows: cardinality,
Expand Down Expand Up @@ -759,8 +770,8 @@ fn estimate_disjoint_inputs(
fn estimate_semi_join_cardinality(
outer_num_rows: &Precision<usize>,
inner_num_rows: &Precision<usize>,
outer_col_stats: &[ColumnStatistics],
inner_col_stats: &[ColumnStatistics],
outer_key_stats: &[ColumnStatistics],
inner_key_stats: &[ColumnStatistics],
) -> Option<usize> {
let outer_rows = *outer_num_rows.get_value()?;
if outer_rows == 0 {
Expand All @@ -774,7 +785,7 @@ fn estimate_semi_join_cardinality(
let mut selectivity = 1.0_f64;
let mut has_selectivity_estimate = false;

for (outer_stat, inner_stat) in outer_col_stats.iter().zip(inner_col_stats.iter()) {
for (outer_stat, inner_stat) in outer_key_stats.iter().zip(inner_key_stats.iter()) {
let outer_has_stats = outer_stat.distinct_count.get_value().is_some()
|| (outer_stat.min_value.get_value().is_some()
&& outer_stat.max_value.get_value().is_some());
Expand Down Expand Up @@ -3246,6 +3257,50 @@ mod tests {
Ok(())
}

#[test]
fn test_semi_anti_join_disjoint_check_uses_only_join_keys() {
let join_on = vec![(
Arc::new(Column::new("l_key", 0)) as _,
Arc::new(Column::new("r_key", 0)) as _,
)];

// Ranges for the join key overlap; ranges for the other column are disjoint
let left_stats = Statistics {
num_rows: Inexact(50),
total_byte_size: Absent,
column_statistics: vec![
create_column_stats(Inexact(1), Inexact(10), Absent, Absent),
create_column_stats(Inexact(100), Inexact(200), Absent, Absent),
],
};
let right_stats = Statistics {
num_rows: Inexact(10),
total_byte_size: Absent,
column_statistics: vec![
create_column_stats(Inexact(1), Inexact(10), Absent, Absent),
create_column_stats(Inexact(1000), Inexact(2000), Absent, Absent),
],
};

let left_semi = estimate_join_cardinality(
&JoinType::LeftSemi,
left_stats.clone(),
right_stats.clone(),
&join_on,
)
.map(|c| c.num_rows);
assert_eq!(left_semi, Some(50));

let left_anti = estimate_join_cardinality(
&JoinType::LeftAnti,
left_stats,
right_stats,
&join_on,
)
.map(|c| c.num_rows);
assert_eq!(left_anti, Some(0));
}

#[test]
fn test_calculate_join_output_ordering() -> Result<()> {
let left_ordering = LexOrdering::new(vec![
Expand Down
Loading