Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions datafusion/core/tests/sql/joins.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1428,9 +1428,9 @@ async fn reduce_left_join_1() -> Result<()> {
"Explain [plan_type:Utf8, plan:Utf8]",
" Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Filter: CAST(t1.t1_id AS Int64) < Int64(100) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎉 the casts have been removed!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was the original regression noted in #3699 (comment) that led to #3702

" Filter: t1.t1_id < UInt32(100) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
" TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
" Filter: CAST(t2.t2_id AS Int64) < Int64(100) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Filter: t2.t2_id < UInt32(100) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
];
let formatted = plan.display_indent_schema().to_string();
Expand Down Expand Up @@ -1476,10 +1476,10 @@ async fn reduce_left_join_2() -> Result<()> {
let expected = vec![
"Explain [plan_type:Utf8, plan:Utf8]",
" Projection: t1.t1_id, t1.t1_name, t1.t1_int, t2.t2_id, t2.t2_name, t2.t2_int [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Filter: CAST(t2.t2_int AS Int64) < Int64(10) OR CAST(t1.t1_int AS Int64) > Int64(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Filter: t2.t2_int < UInt32(10) OR t1.t1_int > UInt32(2) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
" Filter: CAST(t2.t2_int AS Int64) < Int64(10) OR t2.t2_name != Utf8(\"w\") [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Filter: t2.t2_int < UInt32(10) OR t2.t2_name != Utf8(\"w\") [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
];
let formatted = plan.display_indent_schema().to_string();
Expand Down Expand Up @@ -1524,9 +1524,9 @@ async fn reduce_left_join_3() -> Result<()> {
" Projection: t3.t1_id, t3.t1_name, t3.t1_int, alias=t3 [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
" Projection: t1.t1_id, t1.t1_name, t1.t1_int, alias=t3 [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
" Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Filter: CAST(t1.t1_id AS Int64) < Int64(100) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
" Filter: t1.t1_id < UInt32(100) [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
" TableScan: t1 projection=[t1_id, t1_name, t1_int] [t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
" Filter: CAST(t2.t2_int AS Int64) < Int64(3) AND CAST(t2.t2_id AS Int64) < Int64(100) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" Filter: t2.t2_int < UInt32(3) AND t2.t2_id < UInt32(100) [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
" TableScan: t2 projection=[t2_id, t2_name, t2_int] [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
];
Expand Down
126 changes: 111 additions & 15 deletions datafusion/optimizer/src/unwrap_cast_in_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,11 @@ fn is_comparison_op(op: &Operator) -> bool {
fn is_support_data_type(data_type: &DataType) -> bool {
matches!(
data_type,
DataType::Int8
DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
Expand All @@ -292,6 +296,25 @@ fn is_support_data_type(data_type: &DataType) -> bool {
)
}

/// Returns `true` when `dt` is a `Decimal128` type; the two fields carried by
/// the variant are ignored.
fn is_decimal_type(dt: &DataType) -> bool {
    matches!(dt, DataType::Decimal128(..))
}

/// Returns `true` when `dt` is one of the unsigned integer types
/// (`UInt8`, `UInt16`, `UInt32` or `UInt64`).
fn is_unsigned_type(dt: &DataType) -> bool {
    matches!(
        dt,
        DataType::UInt64
            | DataType::UInt32
            | DataType::UInt16
            | DataType::UInt8
    )
}

/// Until https://github.com/apache/arrow-rs/issues/1043 is done
/// (support for unsigned <--> decimal casts) we also don't do that
/// kind of cast in this optimizer
fn is_unsupported_cast(dt1: &DataType, dt2: &DataType) -> bool {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found this with the test (which failed when it tried to invoke the arrow cast kernels for decimal <--> unsigned)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Casting between unsigned values and decimals is not supported yet.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will try to implement them soon.

Copy link
Contributor

@liukun4515 liukun4515 Nov 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(is_decimal_type(dt1) && is_unsigned_type(dt2))
|| (is_decimal_type(dt2) && is_unsigned_type(dt1))
}

fn try_cast_literal_to_type(
lit_value: &ScalarValue,
target_type: &DataType,
Expand All @@ -301,12 +324,22 @@ fn try_cast_literal_to_type(
if !is_support_data_type(&lit_data_type) || !is_support_data_type(target_type) {
return Ok(None);
}
if is_unsupported_cast(&lit_data_type, target_type) {
return Ok(None);
}
if lit_value.is_null() {
// null value can be cast to any type of null value
return Ok(Some(ScalarValue::try_from(target_type)?));
}
let mul = match target_type {
DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => 1_i128,
DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64 => 1_i128,
DataType::Timestamp(_, _) => 1_i128,
DataType::Decimal128(_, scale) => 10_i128.pow(*scale as u32),
other_type => {
Expand All @@ -317,6 +350,10 @@ fn try_cast_literal_to_type(
}
};
let (target_min, target_max) = match target_type {
DataType::UInt8 => (u8::MIN as i128, u8::MAX as i128),
DataType::UInt16 => (u16::MIN as i128, u16::MAX as i128),
DataType::UInt32 => (u32::MIN as i128, u32::MAX as i128),
DataType::UInt64 => (u64::MIN as i128, u64::MAX as i128),
DataType::Int8 => (i8::MIN as i128, i8::MAX as i128),
DataType::Int16 => (i16::MIN as i128, i16::MAX as i128),
DataType::Int32 => (i32::MIN as i128, i32::MAX as i128),
Expand All @@ -341,6 +378,10 @@ fn try_cast_literal_to_type(
ScalarValue::Int16(Some(v)) => (*v as i128).checked_mul(mul),
ScalarValue::Int32(Some(v)) => (*v as i128).checked_mul(mul),
ScalarValue::Int64(Some(v)) => (*v as i128).checked_mul(mul),
ScalarValue::UInt8(Some(v)) => (*v as i128).checked_mul(mul),
ScalarValue::UInt16(Some(v)) => (*v as i128).checked_mul(mul),
ScalarValue::UInt32(Some(v)) => (*v as i128).checked_mul(mul),
ScalarValue::UInt64(Some(v)) => (*v as i128).checked_mul(mul),
ScalarValue::TimestampSecond(Some(v), _) => (*v as i128).checked_mul(mul),
ScalarValue::TimestampMillisecond(Some(v), _) => (*v as i128).checked_mul(mul),
ScalarValue::TimestampMicrosecond(Some(v), _) => (*v as i128).checked_mul(mul),
Expand Down Expand Up @@ -383,6 +424,10 @@ fn try_cast_literal_to_type(
DataType::Int16 => ScalarValue::Int16(Some(value as i16)),
DataType::Int32 => ScalarValue::Int32(Some(value as i32)),
DataType::Int64 => ScalarValue::Int64(Some(value as i64)),
DataType::UInt8 => ScalarValue::UInt8(Some(value as u8)),
DataType::UInt16 => ScalarValue::UInt16(Some(value as u16)),
DataType::UInt32 => ScalarValue::UInt32(Some(value as u32)),
DataType::UInt64 => ScalarValue::UInt64(Some(value as u64)),
DataType::Timestamp(TimeUnit::Second, tz) => {
ScalarValue::TimestampSecond(Some(value as i64), tz.clone())
}
Expand Down Expand Up @@ -469,6 +514,15 @@ mod tests {
assert_eq!(optimize_test(lit_lt_lit, &schema), expected);
}

#[test]
fn test_unwrap_cast_comparison_unsigned() {
    // `cast(c6, UInt64) = 0u64` is expected to be rewritten to `c6 = 0u32`:
    // the cast around the column is dropped and the literal becomes a u32.
    let schema = expr_test_schema();
    let wrapped = cast(col("c6"), DataType::UInt64).eq(lit(0u64));
    let unwrapped = col("c6").eq(lit(0u32));
    assert_eq!(optimize_test(wrapped, &schema), unwrapped);
}

#[test]
fn test_not_unwrap_cast_with_decimal_comparison() {
let schema = expr_test_schema();
Expand Down Expand Up @@ -635,16 +689,16 @@ mod tests {

#[test]
fn test_not_support_data_type() {
// "c6 > 0" will be cast to `cast(c6 as int64) > 0
// "c6 > 0" will be cast to `cast(c6 as float) > 0
// but the type of c6 is uint32
// the rewriter will not throw error and just return the original expr
let schema = expr_test_schema();
let expr_input = cast(col("c6"), DataType::Int64).eq(lit(0i64));
let expr_input = cast(col("c6"), DataType::Float64).eq(lit(0f64));
Copy link
Contributor

@liukun4515 liukun4515 Nov 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment `"c6 > 0" will be cast to cast(c6 as int64) > 0` should be changed to `"c6 > 0" will be cast to cast(c6 as float64) > 0`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in c6946f0

assert_eq!(optimize_test(expr_input.clone(), &schema), expr_input);

// inlist for unsupported data type
let expr_input =
in_list(cast(col("c6"), DataType::Int64), vec![lit(0i64)], false);
in_list(cast(col("c6"), DataType::Float64), vec![lit(0f64)], false);
assert_eq!(optimize_test(expr_input.clone(), &schema), expr_input);
}

Expand Down Expand Up @@ -733,17 +787,24 @@ mod tests {
ScalarValue::Int16(None),
ScalarValue::Int32(None),
ScalarValue::Int64(None),
ScalarValue::UInt8(None),
ScalarValue::UInt16(None),
ScalarValue::UInt32(None),
ScalarValue::UInt64(None),
ScalarValue::Decimal128(None, 3, 0),
ScalarValue::Decimal128(None, 8, 2),
];

for s1 in &scalars {
for s2 in &scalars {
expect_cast(
s1.clone(),
s2.get_datatype(),
ExpectedCast::Value(s2.clone()),
);
let expected_value =
if is_unsupported_cast(&s1.get_datatype(), &s2.get_datatype()) {
ExpectedCast::NoValue
} else {
ExpectedCast::Value(s2.clone())
};

expect_cast(s1.clone(), s2.get_datatype(), expected_value);
}
}
}
Expand All @@ -756,25 +817,56 @@ mod tests {
ScalarValue::Int16(Some(123)),
ScalarValue::Int32(Some(123)),
ScalarValue::Int64(Some(123)),
ScalarValue::UInt8(Some(123)),
ScalarValue::UInt16(Some(123)),
ScalarValue::UInt32(Some(123)),
ScalarValue::UInt64(Some(123)),
ScalarValue::Decimal128(Some(123), 3, 0),
ScalarValue::Decimal128(Some(12300), 8, 2),
];

for s1 in &scalars {
for s2 in &scalars {
expect_cast(
s1.clone(),
s2.get_datatype(),
ExpectedCast::Value(s2.clone()),
);
let expected_value =
if is_unsupported_cast(&s1.get_datatype(), &s2.get_datatype()) {
ExpectedCast::NoValue
} else {
ExpectedCast::Value(s2.clone())
};

expect_cast(s1.clone(), s2.get_datatype(), expected_value);
}
}

let max_i32 = ScalarValue::Int32(Some(i32::MAX));
expect_cast(
max_i32,
DataType::UInt64,
ExpectedCast::Value(ScalarValue::UInt64(Some(i32::MAX as u64))),
);

let min_i32 = ScalarValue::Int32(Some(i32::MIN));
expect_cast(
min_i32,
DataType::Int64,
ExpectedCast::Value(ScalarValue::Int64(Some(i32::MIN as i64))),
);

let max_i64 = ScalarValue::Int64(Some(i64::MAX));
expect_cast(
max_i64,
DataType::UInt64,
ExpectedCast::Value(ScalarValue::UInt64(Some(i64::MAX as u64))),
);
}

#[test]
fn test_try_cast_to_type_int_out_of_range() {
let min_i32 = ScalarValue::Int32(Some(i32::MIN));
let min_i64 = ScalarValue::Int64(Some(i64::MIN));
let max_i64 = ScalarValue::Int64(Some(i64::MAX));
let max_u64 = ScalarValue::UInt64(Some(u64::MAX));

expect_cast(max_i64.clone(), DataType::Int8, ExpectedCast::NoValue);

expect_cast(max_i64.clone(), DataType::Int16, ExpectedCast::NoValue);
Expand All @@ -783,6 +875,10 @@ mod tests {

expect_cast(max_u64, DataType::Int64, ExpectedCast::NoValue);

expect_cast(min_i64, DataType::UInt64, ExpectedCast::NoValue);

expect_cast(min_i32, DataType::UInt64, ExpectedCast::NoValue);

// decimal out of range
expect_cast(
ScalarValue::Decimal128(Some(99999999999999999999999999999999999900), 38, 0),
Expand Down
6 changes: 3 additions & 3 deletions datafusion/optimizer/tests/integration-test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ fn case_when() -> Result<()> {

let sql = "SELECT CASE WHEN col_uint32 > 0 THEN 1 ELSE 0 END FROM test";
let plan = test_sql(sql)?;
let expected = "Projection: CASE WHEN CAST(test.col_uint32 AS Int64) > Int64(0) THEN Int64(1) ELSE Int64(0) END\
\n TableScan: test projection=[col_uint32]";
let expected = "Projection: CASE WHEN test.col_uint32 > UInt32(0) THEN Int64(1) ELSE Int64(0) END AS CASE WHEN test.col_uint32 > Int64(0) THEN Int64(1) ELSE Int64(0) END\
\n TableScan: test projection=[col_uint32]";
assert_eq!(expected, format!("{:?}", plan));
Ok(())
}
Expand Down Expand Up @@ -91,7 +91,7 @@ fn unsigned_target_type() -> Result<()> {
let sql = "SELECT col_utf8 FROM test WHERE col_uint32 > 0";
let plan = test_sql(sql)?;
let expected = "Projection: test.col_utf8\
\n Filter: CAST(test.col_uint32 AS Int64) > Int64(0)\
\n Filter: test.col_uint32 > UInt32(0)\
\n TableScan: test projection=[col_uint32, col_utf8]";
assert_eq!(expected, format!("{:?}", plan));
Ok(())
Expand Down