Skip to content

Optimizer: type coercion difference in schemas for LargeUtf8 input on UDF accepting coercible binary #18746

@Jefffrey

Description

@Jefffrey

Describe the bug

Full reproduction:

use std::any::Any;

use arrow::datatypes::DataType;
use datafusion::common::types::{logical_binary, logical_string, NativeType};
use datafusion::common::ScalarValue;
use datafusion::error::Result;
use datafusion::logical_expr::{Coercion, TypeSignatureClass, Volatility};
use datafusion::logical_expr::{
    ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
};
use datafusion::prelude::*;

/// Minimal scalar UDF used to reproduce the type-coercion schema mismatch.
///
/// The only state is the [`Signature`] built in `ExampleUdf::new`, which is
/// handed back from `ScalarUDFImpl::signature`.
#[derive(Debug, PartialEq, Eq, Hash)]
struct ExampleUdf {
    /// Argument signature advertised to the planner.
    signature: Signature,
}

impl ExampleUdf {
    /// Creates the UDF with a one-argument coercible signature.
    ///
    /// The argument's desired class is logical binary; logical string inputs
    /// are listed as an allowed implicit source, with `NativeType::Binary`
    /// passed as the default type to cast to.
    fn new() -> Self {
        let binary_or_string = Coercion::new_implicit(
            TypeSignatureClass::Native(logical_binary()),
            vec![TypeSignatureClass::Native(logical_string())],
            NativeType::Binary,
        );
        let signature =
            Signature::coercible(vec![binary_or_string], Volatility::Immutable);

        Self { signature }
    }
}

impl ScalarUDFImpl for ExampleUdf {
    /// Exposes `self` as `&dyn Any`.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Name under which the function is registered and called from SQL.
    fn name(&self) -> &str {
        "example_udf"
    }

    /// Returns the signature stored at construction time.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Maps the first argument's type to the output type:
    /// `Binary`/`BinaryView` produce `Utf8`, `LargeBinary` produces `LargeUtf8`.
    ///
    /// # Panics
    ///
    /// Panics if `arg_types` is empty or its first entry is any other type.
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        let input = &arg_types[0];
        if matches!(input, DataType::LargeBinary) {
            return Ok(DataType::LargeUtf8);
        }
        if matches!(input, DataType::Binary | DataType::BinaryView) {
            return Ok(DataType::Utf8);
        }
        unreachable!()
    }

    /// Returns the constant scalar `"a"`, as `Utf8` for `Binary`/`BinaryView`
    /// input and as `LargeUtf8` for `LargeBinary` input.
    ///
    /// # Panics
    ///
    /// Panics if there is no first argument or it has any other type.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        let input_type = args.args[0].data_type();
        let scalar = match input_type {
            DataType::LargeBinary => ScalarValue::LargeUtf8(Some("a".to_string())),
            DataType::Binary | DataType::BinaryView => {
                ScalarValue::Utf8(Some("a".to_string()))
            }
            _ => unreachable!(),
        };
        Ok(ColumnarValue::Scalar(scalar))
    }
}

/// Registers `ExampleUdf` in a fresh session, then runs and prints each test
/// query in order, stopping at the first error.
#[tokio::main]
async fn main() -> Result<()> {
    let session = SessionContext::new();
    let udf = ScalarUDF::from(ExampleUdf::new());
    session.register_udf(udf);

    let queries = [
        // These five run without error.
        "select example_udf(arrow_cast('a', 'Binary'))",
        "select example_udf(arrow_cast('a', 'BinaryView'))",
        "select example_udf(arrow_cast('a', 'LargeBinary'))",
        "select example_udf(arrow_cast('a', 'Utf8'))",
        "select example_udf(arrow_cast('a', 'Utf8View'))",
        // This one fails with the schema-mismatch error.
        "select example_udf(arrow_cast('a', 'LargeUtf8'))",
    ];

    for query in queries {
        let frame = session.sql(query).await?;
        frame.show().await?;
    }

    Ok(())
}

Here we have a UDF which accepts string and binary inputs, where strings are coerced to binary. The example SQL statements test each type that should be accepted. The first five statements run fine; however, the last one fails with this error:

Error: Context("Optimizer rule 'optimize_projections' failed", Context("Check optimizer-specific invariants after optimizer rule: optimize_projections", Internal("Failed due to a difference in schemas: original schema: DFSchema { inner: Schema { fields: [Field { name: \"example_udf(arrow_cast(Utf8(\\\"a\\\"),Utf8(\\\"LargeUtf8\\\")))\", data_type: Utf8, nullable: true }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, new schema: DFSchema { inner: Schema { fields: [Field { name: \"example_udf(arrow_cast(Utf8(\\\"a\\\"),Utf8(\\\"LargeUtf8\\\")))\", data_type: LargeUtf8 }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }")))

Formatted for readability:

Failed due to a difference in schemas:
original schema: DFSchema { inner: Schema { fields: [Field { name: "example_udf(arrow_cast(Utf8("a"),Utf8("LargeUtf8")))", data_type: Utf8, nullable: true }] } },
     new schema: DFSchema { inner: Schema { fields: [Field { name: "example_udf(arrow_cast(Utf8("a"),Utf8("LargeUtf8")))", data_type: LargeUtf8 }] } }"
  • Removed unrelated info and extracted the difference between the schemas
  • Note that the error is reported by optimize_projections, but I believe this is only because that is where the schema is checked; the error stems from the type_coercion rule
  • Note that the data type differs, as well as the nullability

There seems to be something going on in the type coercion analyzer rule that incorrectly changes the projection's schema from LargeUtf8 to Utf8?

To Reproduce

No response

Expected behavior

No response

Additional context

No response

Metadata

Metadata

Assignees

Labels

bug: Something isn't working

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions