-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Labels
bug: Something isn't working
Description
Describe the bug
Full reproduction:
use std::any::Any;
use arrow::datatypes::DataType;
use datafusion::common::types::{logical_binary, logical_string, NativeType};
use datafusion::common::ScalarValue;
use datafusion::error::Result;
use datafusion::logical_expr::{Coercion, TypeSignatureClass, Volatility};
use datafusion::logical_expr::{
ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
};
use datafusion::prelude::*;
/// Minimal scalar UDF used by this reproduction.
///
/// Its only state is the [`Signature`] returned from
/// `ScalarUDFImpl::signature`.
#[derive(Debug, Eq, Hash, PartialEq)]
struct ExampleUdf {
    /// Argument signature, built once in `ExampleUdf::new`.
    signature: Signature,
}
impl ExampleUdf {
    /// Builds the UDF with a one-argument, immutable, coercible signature.
    ///
    /// The argument is declared as native binary, with native string listed as
    /// an implicitly coercible source and `NativeType::Binary` as the type
    /// passed for casting such inputs.
    fn new() -> Self {
        // Class the single argument should end up as.
        let desired = TypeSignatureClass::Native(logical_binary());
        // Source classes that may be implicitly coerced to `desired`.
        let implicit_sources = vec![TypeSignatureClass::Native(logical_string())];
        let binary_arg =
            Coercion::new_implicit(desired, implicit_sources, NativeType::Binary);

        let signature = Signature::coercible(vec![binary_arg], Volatility::Immutable);
        Self { signature }
    }
}
impl ScalarUDFImpl for ExampleUdf {
    /// Exposes `self` as `&dyn Any`.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Name under which the function is called in SQL.
    fn name(&self) -> &str {
        "example_udf"
    }

    /// Borrows the signature built in `ExampleUdf::new`.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Maps the first argument's binary type to the string type of matching
    /// offset width: `Binary`/`BinaryView` give `Utf8`, `LargeBinary` gives
    /// `LargeUtf8`.
    ///
    /// # Panics
    ///
    /// Panics if `arg_types` is empty, or via `unreachable!()` if the first
    /// argument is any other type.
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        let output = match &arg_types[0] {
            DataType::Binary | DataType::BinaryView => DataType::Utf8,
            DataType::LargeBinary => DataType::LargeUtf8,
            // The repro relies on coercion having produced a binary type already.
            _ => unreachable!(),
        };
        Ok(output)
    }

    /// Returns the constant scalar `"a"`, typed to agree with `return_type`
    /// for the first argument's data type.
    ///
    /// # Panics
    ///
    /// Panics if there are no arguments, or via `unreachable!()` if the first
    /// argument is not `Binary`, `BinaryView` or `LargeBinary`.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        let scalar = match &args.args[0].data_type() {
            DataType::Binary | DataType::BinaryView => {
                ScalarValue::Utf8(Some("a".to_string()))
            }
            DataType::LargeBinary => ScalarValue::LargeUtf8(Some("a".to_string())),
            // The repro relies on coercion having produced a binary type already.
            _ => unreachable!(),
        };
        Ok(ColumnarValue::Scalar(scalar))
    }
}
/// Registers `ExampleUdf` on a fresh `SessionContext`, then runs each query in
/// `sqls` in order and shows its result. The first failing query ends `main`
/// with that error via `?`.
#[tokio::main]
async fn main() -> Result<()> {
let ctx = SessionContext::new();
ctx.register_udf(ScalarUDF::from(ExampleUdf::new()));
// One query per input type; `arrow_cast` pins the argument's Arrow type.
let sqls = [
// all of these are fine
"select example_udf(arrow_cast('a', 'Binary'))",
"select example_udf(arrow_cast('a', 'BinaryView'))",
"select example_udf(arrow_cast('a', 'LargeBinary'))",
"select example_udf(arrow_cast('a', 'Utf8'))",
"select example_udf(arrow_cast('a', 'Utf8View'))",
// this one fails with a schema-mismatch error
"select example_udf(arrow_cast('a', 'LargeUtf8'))",
];
for sql in sqls {
// `?` propagates the first error from either the `sql` call or `show`.
ctx.sql(sql).await?.show().await?;
}
Ok(())
}Here we have a UDF which accepts string & binary inputs, where strings are coerced to binary. Can see in the example SQLs we test each type that should be accepted. The first five SQL are fine, however the last one fails with error:
Error: Context("Optimizer rule 'optimize_projections' failed", Context("Check optimizer-specific invariants after optimizer rule: optimize_projections", Internal("Failed due to a difference in schemas: original schema: DFSchema { inner: Schema { fields: [Field { name: \"example_udf(arrow_cast(Utf8(\\\"a\\\"),Utf8(\\\"LargeUtf8\\\")))\", data_type: Utf8, nullable: true }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, new schema: DFSchema { inner: Schema { fields: [Field { name: \"example_udf(arrow_cast(Utf8(\\\"a\\\"),Utf8(\\\"LargeUtf8\\\")))\", data_type: LargeUtf8 }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }")))
Formatted for readability:
Failed due to a difference in schemas:
original schema: DFSchema { inner: Schema { fields: [Field { name: "example_udf(arrow_cast(Utf8("a"),Utf8("LargeUtf8")))", data_type: Utf8, nullable: true }] } },
new schema: DFSchema { inner: Schema { fields: [Field { name: "example_udf(arrow_cast(Utf8("a"),Utf8("LargeUtf8")))", data_type: LargeUtf8 }] } }"
- Removed unrelated info and extracted the difference between the schemas
- Note the error is reported by `optimize_projections`, but I believe this is only because that is where the schema is checked; the error stems from the `type_coercion` rule
- See how the data type differs, as well as the nullability
There seems to be something going on in the type coercion analyzer rule that incorrectly changes the project schema from LargeUtf8 to Utf8?
To Reproduce
No response
Expected behavior
No response
Additional context
No response
Metadata
Metadata
Assignees
Labels
bug: Something isn't working