diff --git a/.gitattributes b/.gitattributes index 84b47a6fc56e1..918d83a00a9e5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,4 @@ datafusion/core/tests/data/newlines_in_values.csv text eol=lf datafusion/proto/src/generated/prost.rs linguist-generated datafusion/proto/src/generated/pbjson.rs linguist-generated +datafusion/substrait/extensions/functions_datafusion.yaml linguist-generated diff --git a/Cargo.lock b/Cargo.lock index 66aef04c92394..db25839e148db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2656,6 +2656,7 @@ dependencies = [ "pbjson-types", "prost", "serde_json", + "serde_yaml", "substrait", "tokio", "url", diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index a0f203cec8db6..2f09a4db1bda9 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -44,8 +44,10 @@ object_store = { workspace = true } # We need to match the version in substrait, so we don't use the workspace version here pbjson-types = { version = "0.8.0" } prost = { workspace = true } +serde_json = { workspace = true } substrait = { version = "0.63.0", features = ["serde"] } url = { workspace = true } +serde_yaml = "0.9.34" tokio = { workspace = true, features = ["fs"] } [dev-dependencies] diff --git a/datafusion/substrait/extensions/functions_datafusion.yaml b/datafusion/substrait/extensions/functions_datafusion.yaml new file mode 100644 index 0000000000000..969f728425f79 --- /dev/null +++ b/datafusion/substrait/extensions/functions_datafusion.yaml @@ -0,0 +1,3742 @@ +aggregate_functions: +- description: Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm. + impls: + - args: + - name: expression + value: any + decomposable: MANY + deterministic: true + intermediate: i64 + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: approx_distinct +- description: Returns the approximate median (50th percentile) of input values. 
It is an alias of `approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY x)`. + impls: + - args: + - name: expression + value: float + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: approx_median +- description: Returns the approximate percentile of input values using the t-digest algorithm. + impls: + - args: + - name: expression + value: float + - name: percentile + value: fp64 + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: float + - name: percentile + value: fp64 + - name: centroids + value: integer + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: approx_percentile_cont +- description: Returns the weighted approximate percentile of input values using the t-digest algorithm. + impls: + - args: + - name: expression + value: float + - name: weight + value: float + - name: percentile + value: fp64 + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: float + - name: weight + value: float + - name: percentile + value: fp64 + - name: centroids + value: integer + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: approx_percentile_cont_with_weight +- description: |- + Returns an array created from the expression elements. If ordering is required, elements are inserted in the specified order. + This aggregation function can only mix DISTINCT and ORDER BY if the ordering expression is exactly the same as the argument expression. 
+ impls: + - args: + - name: expression + value: any + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: array_agg +- description: Returns the average of numeric values in the specified column. + impls: + - args: + - name: expression + value: decimal + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: duration + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: fp64 + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + metadata: + datafusion: + aliases: + - mean + name: avg +- description: Computes the bitwise AND of all non-null input values. + impls: + - args: + - name: expression + value: integer + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: bit_and +- description: Computes the bitwise OR of all non-null input values. + impls: + - args: + - name: expression + value: integer + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: bit_or +- description: Computes the bitwise exclusive OR of all non-null input values. + impls: + - args: + - name: expression + value: integer + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: bit_xor +- description: Returns true if all non-null input values are true, otherwise false. 
+ impls: + - args: + - name: expression + value: boolean + decomposable: MANY + deterministic: true + intermediate: boolean + nullability: MIRROR + ordered: false + return: boolean + sessionDependent: false + name: bool_and +- description: Returns true if any non-null input value is true, otherwise false. + impls: + - args: + - name: expression + value: boolean + decomposable: MANY + deterministic: true + intermediate: boolean + nullability: MIRROR + ordered: false + return: boolean + sessionDependent: false + name: bool_or +- description: Returns the coefficient of correlation between two numeric values. + impls: + - args: + - name: y + value: fp64 + - name: x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: corr +- description: Returns the number of non-null values in the specified column. To include null values in the total count, use `count(*)`. + impls: + - args: + - name: expression + value: any + decomposable: MANY + deterministic: true + intermediate: i64 + nullability: DECLARED_OUTPUT + ordered: false + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + - decomposable: MANY + deterministic: true + intermediate: i64 + nullability: DECLARED_OUTPUT + ordered: false + return: i64 + sessionDependent: false + name: count +- description: Returns the population covariance of a set of number pairs. + impls: + - args: + - name: expression1 + value: fp64 + - name: expression2 + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: covar_pop +- description: Returns the sample covariance of a set of number pairs.
+ impls: + - args: + - name: expression1 + value: fp64 + - name: expression2 + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + metadata: + datafusion: + aliases: + - covar + name: covar_samp +- description: Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. + impls: + - args: + - name: expression + value: any + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: first_value +- description: Returns 1 if the data is aggregated across the specified column, or 0 if it is not aggregated in the result set. + impls: + - args: + - name: expression + value: any + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: grouping +- description: Returns the last element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. + impls: + - args: + - name: expression + value: any + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: last_value +- description: Returns the maximum value in the specified column. + impls: + - args: + - name: expression + value: any1 + decomposable: MANY + deterministic: true + intermediate: any1 + nullability: MIRROR + ordered: false + return: any1 + sessionDependent: false + name: max +- description: Returns the median value in the specified column. 
+ impls: + - args: + - name: expression + value: decimal + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: float + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: fp64 + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: median +- description: Returns the minimum value in the specified column. + impls: + - args: + - name: expression + value: any1 + decomposable: MANY + deterministic: true + intermediate: any1 + nullability: MIRROR + ordered: false + return: any1 + sessionDependent: false + name: min +- description: Returns the nth value in a group of values. + impls: + - args: + - name: expression + value: any + - name: n + value: any + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: nth_value +- description: Returns the exact percentile of input values, interpolating between values if needed. + impls: + - args: + - name: expr + value: float + - name: percentile + value: fp64 + decomposable: NONE + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + metadata: + datafusion: + aliases: + - quantile_cont + name: percentile_cont +- description: Computes the average of the independent variable (input) expression_x for the non-null paired data points. 
+ impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: regr_avgx +- description: Computes the average of the dependent variable (output) expression_y for the non-null paired data points. + impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: regr_avgy +- description: Counts the number of non-null paired data points. + impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: i64 + nullability: MIRROR + ordered: false + return: u64 + sessionDependent: false + name: regr_count +- description: Computes the y-intercept of the linear regression line. For the equation (y = kx + b), this function returns b. + impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: regr_intercept +- description: Computes the square of the correlation coefficient between the independent and dependent variables. + impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: regr_r2 +- description: 'Returns the slope of the linear regression line for non-null pairs in aggregate columns. Given input column Y and X: regr_slope(Y, X) returns the slope (k in Y = k*X + b) using minimal RSS fitting.' 
+ impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: regr_slope +- description: Computes the sum of squares of the independent variable. + impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: regr_sxx +- description: Computes the sum of products of paired data points. + impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: regr_sxy +- description: Computes the sum of squares of the dependent variable. + impls: + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: regr_syy +- description: Returns the standard deviation of a set of numbers. + impls: + - args: + - name: expression + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + metadata: + datafusion: + aliases: + - stddev_samp + name: stddev +- description: Returns the population standard deviation of a set of numbers. + impls: + - args: + - name: expression + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + name: stddev_pop +- description: Concatenates the values of string expressions and places separator values between them. 
If ordering is required, strings are concatenated in the specified order. This aggregation function can only mix DISTINCT and ORDER BY if the ordering expression is exactly the same as the first argument expression. + impls: + - args: + - name: expression + value: string + - name: delimiter + value: string + decomposable: MANY + deterministic: true + intermediate: string + nullability: MIRROR + ordered: false + return: string + sessionDependent: false + - args: + - name: expression + value: string + - name: delimiter + value: any + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: string_agg +- description: Returns the sum of all values in the specified column. + impls: + - args: + - name: expression + value: decimal + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: u64 + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: i64 + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: fp64 + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + - args: + - name: expression + value: duration + decomposable: MANY + deterministic: true + intermediate: any + nullability: MIRROR + ordered: false + return: any + sessionDependent: false + name: sum +- description: Returns the statistical sample variance of a set of numbers. 
+ impls: + - args: + - name: expression + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + metadata: + datafusion: + aliases: + - var_samp + - var_sample + name: var +- description: Returns the statistical population variance of a set of numbers. + impls: + - args: + - name: expression + value: fp64 + decomposable: MANY + deterministic: true + intermediate: fp64 + nullability: MIRROR + ordered: false + return: fp64 + sessionDependent: false + metadata: + datafusion: + aliases: + - var_population + name: var_pop +scalar_functions: +- description: Returns the absolute value of a number. + impls: + - args: + - name: numeric_expression + value: i8 + deterministic: true + nullability: MIRROR + return: i8 + sessionDependent: false + - args: + - name: numeric_expression + value: i16 + deterministic: true + nullability: MIRROR + return: i16 + sessionDependent: false + - args: + - name: numeric_expression + value: i32 + deterministic: true + nullability: MIRROR + return: i32 + sessionDependent: false + - args: + - name: numeric_expression + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + - args: + - name: numeric_expression + value: u8 + deterministic: true + nullability: MIRROR + return: u8 + sessionDependent: false + - args: + - name: numeric_expression + value: u16 + deterministic: true + nullability: MIRROR + return: u16 + sessionDependent: false + - args: + - name: numeric_expression + value: u32 + deterministic: true + nullability: MIRROR + return: u32 + sessionDependent: false + - args: + - name: numeric_expression + value: u64 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + - args: + - name: numeric_expression + value: fp64 + 
deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + name: abs +- description: Returns the arc cosine or inverse cosine of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: acos +- description: Returns the area hyperbolic cosine or inverse hyperbolic cosine of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: acosh +- description: Casts a value to a specific Arrow data type. + impls: + - args: + - name: expression + value: any + - name: datatype + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: arrow_cast +- description: Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata. + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: arrow_field +- description: Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata. + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: expression + value: any + - name: key + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: arrow_metadata +- description: Casts a value to a specific Arrow data type, returning NULL if the cast fails. 
+ impls: + - args: + - name: expression + value: any + - name: datatype + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: arrow_try_cast +- description: Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression. + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: arrow_typeof +- description: Returns the first Unicode scalar value of a string. + impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: ascii +- description: Returns the arc sine or inverse sine of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: asin +- description: Returns the area hyperbolic sine or inverse hyperbolic sine of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: asinh +- description: Returns the arc tangent or inverse tangent of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: atan +- description: Returns the arc tangent or inverse tangent of `expression_y / expression_x`. 
+ impls: + - args: + - name: expression_y + value: fp32 + - name: expression_x + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + - args: + - name: expression_y + value: fp64 + - name: expression_x + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + name: atan2 +- description: Returns the area hyperbolic tangent or inverse hyperbolic tangent of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: atanh +- description: Returns the bit length of a string. + impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: bit_length +- description: Trims the specified trim string from the start and end of a string. If no trim string is provided, all spaces are removed from the start and end of the input string. + impls: + - args: + - name: str + value: string + - name: trim_str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + metadata: + datafusion: + aliases: + - trim + name: btrim +- description: Casts the first argument to the data type of the second argument. Only the type of the second argument is used; its value is ignored. + impls: + - args: + - name: expression + value: any + - name: reference + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: cast_to_type +- description: Returns the cube root of a number. 
+ impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: cbrt +- description: Returns the nearest integer greater than or equal to a number. + impls: + - args: + - name: numeric_expression + value: decimal + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: ceil +- description: Returns the number of characters in a string. + impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: i32 + sessionDependent: false + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + metadata: + datafusion: + aliases: + - char_length + - length + name: character_length +- description: Returns a string containing the character with the specified Unicode scalar value. + impls: + - args: + - name: expression + value: i64 + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: chr +- description: Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values. + impls: + - args: + - value: any1 + deterministic: true + nullability: MIRROR + return: any1 + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: CONSISTENT + name: coalesce +- description: Concatenates multiple strings together. 
+ impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: CONSISTENT + - args: + - name: str + value: binary + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: CONSISTENT + name: concat +- description: Concatenates multiple strings together with a specified separator. + impls: + - args: + - name: separator + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: CONSISTENT + name: concat_ws +- description: Return true if search_str is found within string (case-sensitive). + impls: + - args: + - name: str + value: string + - name: search_str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: contains +- description: Returns the cosine of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: cos +- description: Returns the hyperbolic cosine of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: cosh +- description: Returns the cotangent of a number. 
+ impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: cot +- description: |2 + + Returns the current date in the session time zone. + + The `current_date()` return value is determined at query time and will return the same date, no matter when in the query plan the function executes. + impls: + - deterministic: true + nullability: MIRROR + return: date + sessionDependent: true + metadata: + datafusion: + aliases: + - today + name: current_date +- description: |2 + + Returns the current time in the session time zone. + + The `current_time()` return value is determined at query time and will return the same time, no matter when in the query plan the function executes. + + The session time zone can be set using the statement 'SET datafusion.execution.time_zone = desired time zone'. The time zone can be a value like +00:00, 'Europe/London' etc. + impls: + - deterministic: true + nullability: MIRROR + return: time + sessionDependent: true + name: current_time +- description: |2 + + Calculates time intervals and returns the start of the interval nearest to the specified timestamp. Use `date_bin` to downsample time series data by grouping rows into time-based "bins" or "windows" and applying an aggregate or selector function to each window. + + For example, if you "bin" or "window" data into 15 minute intervals, an input timestamp of `2023-01-01T18:18:18Z` will be updated to the start time of the 15 minute bin it is in: `2023-01-01T18:15:00Z`. 
+ impls: + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp + - name: origin-timestamp + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp_tz + - name: origin-timestamp + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp + - name: origin-timestamp + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp_tz + - name: origin-timestamp + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: time + - name: origin-timestamp + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: 
interval_day_time + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: time + - name: origin-timestamp + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp + - name: origin-timestamp + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp_tz + - name: origin-timestamp + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp + - name: origin-timestamp + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp_tz + - name: origin-timestamp + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp + 
deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: time + - name: origin-timestamp + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: time + - name: origin-timestamp + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp + - name: origin-timestamp + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp_tz + - name: origin-timestamp + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp + - name: origin-timestamp + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp_tz + - name: origin-timestamp + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: 
timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: time + - name: origin-timestamp + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: time + - name: origin-timestamp + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp + - name: origin-timestamp + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp_tz + - name: origin-timestamp + value: timestamp_tz + 
deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp + - name: origin-timestamp + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp_tz + - name: origin-timestamp + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: timestamp_tz + deterministic: true + nullability: MIRROR + return: timestamp_tz + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: time + - name: origin-timestamp + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_month_day_nano + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: interval_day_time + - name: expression + value: time + - name: origin-timestamp + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + - args: + - name: interval + value: 
interval_day_time + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: time + sessionDependent: false + name: date_bin +- description: Returns the specified part of the date as an integer. + impls: + - args: + - name: part + value: string + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: part + value: string + - name: expression + value: date + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: part + value: string + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: part + value: string + - name: expression + value: interval + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: part + value: string + - name: expression + value: duration + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + metadata: + datafusion: + aliases: + - datepart + name: date_part +- description: Truncates a timestamp or time value to a specified precision. + impls: + - args: + - name: precision + value: string + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: precision + value: string + - name: expression + value: time + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + metadata: + datafusion: + aliases: + - datetrunc + name: date_trunc +- description: Decode binary data from textual representation in string. + impls: + - args: + - name: expression + value: binary + - name: format + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: decode +- description: Converts radians to degrees. 
+ impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: degrees +- description: Encode binary data into a textual representation. + impls: + - args: + - name: expression + value: binary + - name: format + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: encode +- description: Tests if a string ends with a substring. + impls: + - args: + - name: str + value: string + - name: substr + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: ends_with +- description: Returns the base-e exponential of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: exp +- description: Factorial of a non-negative integer. Errors if the argument is negative or the result overflows. + impls: + - args: + - name: numeric_expression + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + name: factorial +- description: Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings. + impls: + - args: + - name: str + value: string + - name: strlist + value: string + deterministic: true + nullability: MIRROR + return: i32 + sessionDependent: false + - args: + - name: str + value: string + - name: strlist + value: string + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + name: find_in_set +- description: Returns the nearest integer less than or equal to a number. 
+ impls: + - args: + - name: numeric_expression + value: decimal + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: floor +- description: Converts an integer to RFC3339 timestamp format (`YYYY-MM-DDT00:00:00.000000000Z`). Integers and unsigned integers are interpreted as seconds since the Unix epoch (`1970-01-01T00:00:00Z`), and the corresponding timestamp is returned. + impls: + - args: + - name: expression + value: i64 + - name: timezone + value: string + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + - args: + - name: expression + value: i64 + deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: false + name: from_unixtime +- description: Returns the greatest common divisor of `expression_x` and `expression_y`. Returns 0 if both inputs are zero. + impls: + - args: + - name: expression_x + value: i64 + - name: expression_y + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + name: gcd +- description: |- + Returns a field within a map or a struct with the given key. + Supports nested field access by providing multiple field names. + Note: most users invoke `get_field` indirectly via field access + syntax such as `my_struct_col['field_name']` which results in a call to + `get_field(my_struct_col, 'field_name')`. + Nested access like `my_struct['a']['b']` is optimized to a single call: + `get_field(my_struct, 'a', 'b')`.
+ impls: + - args: + - name: expression + value: any + - name: field_name + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: CONSISTENT + name: get_field +- description: Returns the greatest value in a list of expressions. Returns _null_ if all expressions are _null_. + impls: + - args: + - value: any1 + deterministic: true + nullability: MIRROR + return: any1 + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: CONSISTENT + name: greatest +- description: Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters. + impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: initcap +- description: Returns true if a given number is +NaN or -NaN; otherwise returns false. + impls: + - args: + - name: numeric_expression + value: numeric + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: isnan +- description: Returns true if a given number is +0.0 or -0.0; otherwise returns false. + impls: + - args: + - name: numeric_expression + value: numeric + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: iszero +- description: Returns the least common multiple of `expression_x` and `expression_y`. Returns 0 if either input is zero. + impls: + - args: + - name: expression_x + value: i64 + - name: expression_y + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + name: lcm +- description: Returns the smallest value in a list of expressions. Returns _null_ if all expressions are _null_.
+ impls: + - args: + - value: any1 + deterministic: true + nullability: MIRROR + return: any1 + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: CONSISTENT + name: least +- description: Returns a specified number of characters from the left side of a string. + impls: + - args: + - name: str + value: string + - name: n + value: i64 + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: left +- description: Returns the [`Levenshtein distance`](https://en.wikipedia.org/wiki/Levenshtein_distance) between the two given strings. + impls: + - args: + - name: str1 + value: string + - name: str2 + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: levenshtein +- description: Returns the natural logarithm of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: ln +- description: Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number. + impls: + - args: + - name: base + value: decimal + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: base + value: float + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: base + value: float + - name: numeric_expression + value: decimal + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: base + value: float + - name: numeric_expression + value: float + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: log +- description: Returns the base-10 logarithm of a number. 
+ impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: log10 +- description: Returns the base-2 logarithm of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: log2 +- description: Converts a string to lower-case. + impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: lower +- description: Pads the left side of a string with another string to a specified string length. + impls: + - args: + - name: str + value: string + - name: n + value: i64 + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: str + value: string + - name: n + value: i64 + - name: padding_str + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: lpad +- description: Trims the specified trim string from the beginning of a string. If no trim string is provided, spaces are removed from the start of the input string. + impls: + - args: + - name: str + value: string + - name: trim_str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: ltrim +- description: Make a date from year/month/day component parts. 
+ impls: + - args: + - name: year + value: i32 + - name: month + value: i32 + - name: day + value: i32 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: make_date +- description: Make a time from hour/minute/second component parts. + impls: + - args: + - name: hour + value: i32 + - name: minute + value: i32 + - name: second + value: i32 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: make_time +- description: |- + Returns an Arrow struct using the specified name and input expressions pairs. + For information on comparing and ordering struct values (including `NULL` handling), + see [Comparison and Ordering](struct_coercion.md#comparison-and-ordering). + impls: + - args: + - name: expression_n_name + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: named_struct +- description: |- + Returns the first argument if it's not _NaN_. + Returns the second argument otherwise. + impls: + - args: + - name: expression_x + value: fp16 + - name: expression_y + value: fp16 + deterministic: true + nullability: MIRROR + return: fp16 + sessionDependent: false + - args: + - name: expression_x + value: fp32 + - name: expression_y + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + - args: + - name: expression_x + value: fp64 + - name: expression_y + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + name: nanvl +- description: |2 + + Returns the current timestamp in the system configured timezone (None by default). + + The `now()` return value is determined at query time and will return the same timestamp, no matter when in the query plan the function executes. 
+ impls: + - deterministic: true + nullability: MIRROR + return: timestamp + sessionDependent: true + metadata: + datafusion: + aliases: + - current_timestamp + name: now +- description: |- + Returns _null_ if _expression1_ equals _expression2_; otherwise it returns _expression1_. + This can be used to perform the inverse operation of [`coalesce`](#coalesce). + impls: + - args: + - name: expression1 + value: any1 + - name: expression2 + value: any1 + deterministic: true + nullability: MIRROR + return: any1 + sessionDependent: false + name: nullif +- description: Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_ and _expression2_ is not evaluated. This function can be used to substitute a default value for NULL values. + impls: + - args: + - name: expression1 + value: boolean + - name: expression2 + value: boolean + deterministic: true + nullability: MIRROR + return: boolean + sessionDependent: false + - args: + - name: expression1 + value: u8 + - name: expression2 + value: u8 + deterministic: true + nullability: MIRROR + return: u8 + sessionDependent: false + - args: + - name: expression1 + value: u16 + - name: expression2 + value: u16 + deterministic: true + nullability: MIRROR + return: u16 + sessionDependent: false + - args: + - name: expression1 + value: u32 + - name: expression2 + value: u32 + deterministic: true + nullability: MIRROR + return: u32 + sessionDependent: false + - args: + - name: expression1 + value: u64 + - name: expression2 + value: u64 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + - args: + - name: expression1 + value: i8 + - name: expression2 + value: i8 + deterministic: true + nullability: MIRROR + return: i8 + sessionDependent: false + - args: + - name: expression1 + value: i16 + - name: expression2 + value: i16 + deterministic: true + nullability: MIRROR + return: i16 + sessionDependent: false + - args: + - name: expression1 + value: i32 + - name: expression2 + value: 
i32 + deterministic: true + nullability: MIRROR + return: i32 + sessionDependent: false + - args: + - name: expression1 + value: i64 + - name: expression2 + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + - args: + - name: expression1 + value: fp32 + - name: expression2 + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + - args: + - name: expression1 + value: fp64 + - name: expression2 + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: expression1 + value: string + - name: expression2 + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + metadata: + datafusion: + aliases: + - ifnull + name: nvl +- description: Returns _expression2_ if _expression1_ is not NULL; otherwise it returns _expression3_. + impls: + - args: + - name: expression1 + value: any + - name: expression2 + value: any1 + - name: expression3 + value: any1 + deterministic: true + nullability: MIRROR + return: any1 + sessionDependent: false + name: nvl2 +- description: Returns the length of a string in bytes. + impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: octet_length +- description: Returns the string which is replaced by another string from the specified position and specified count length. + impls: + - args: + - name: str + value: string + - name: substr + value: string + - name: pos + value: i64 + - name: count + value: i64 + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: str + value: string + - name: substr + value: string + - name: pos + value: i64 + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: overlay +- description: Returns an approximate value of π. 
+ impls: + - deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + name: pi +- description: Returns a base expression raised to the power of an exponent. + impls: + - args: + - name: base + value: decimal + - name: exponent + value: i64 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: base + value: decimal + - name: exponent + value: fp64 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: base + value: fp64 + - name: exponent + value: fp64 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + metadata: + datafusion: + aliases: + - pow + name: power +- description: Converts degrees to radians. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: radians +- description: |- + Returns a random float value in the range [0, 1). + The random seed is unique to each row. + impls: + - deterministic: false + nullability: MIRROR + return: fp64 + sessionDependent: false + metadata: + datafusion: + aliases: + - rand + name: random +- description: Returns the number of matches that a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has in a string. 
+ impls: + - args: + - name: str + value: string + - name: regexp + value: string + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: start + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: start + value: i64 + - name: flags + value: string + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + name: regexp_count +- description: Returns the position in a string where the specified occurrence of a POSIX regular expression is located. + impls: + - args: + - name: str + value: string + - name: regexp + value: string + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: start + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: start + value: i64 + - name: N + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: start + value: i64 + - name: N + value: i64 + - name: flags + value: string + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: start + value: i64 + - name: N + value: i64 + - name: flags + value: string + - name: subexpr + value: i64 + deterministic: true + nullability: MIRROR + return: i64 + sessionDependent: false + name: regexp_instr +- description: Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise. 
+ impls: + - args: + - name: str + value: string + - name: regexp + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: flags + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: regexp_like +- description: Returns the first [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string. + impls: + - args: + - name: str + value: string + - name: regexp + value: string + deterministic: true + nullability: MIRROR + return: list + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: flags + value: string + deterministic: true + nullability: MIRROR + return: list + sessionDependent: false + name: regexp_match +- description: Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax). + impls: + - args: + - name: str + value: string + - name: regexp + value: string + - name: replacement + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: str + value: string + - name: regexp + value: string + - name: replacement + value: string + - name: flags + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: regexp_replace +- description: Returns a string with an input string repeated a specified number of times. + impls: + - args: + - name: str + value: string + - name: n + value: i64 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: repeat +- description: Replaces all occurrences of a specified substring in a string with a new substring.
+ impls: + - args: + - name: str + value: string + - name: substr + value: string + - name: replacement + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: replace +- description: Reverses the character order of a string. + impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: reverse +- description: Returns a specified number of characters from the right side of a string. + impls: + - args: + - name: str + value: string + - name: n + value: i64 + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: right +- description: Rounds a number to the nearest integer. + impls: + - args: + - name: numeric_expression + value: decimal + - name: decimal_places + value: i32 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: numeric_expression + value: decimal + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + - name: decimal_places + value: i32 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: numeric_expression + value: fp64 + - name: decimal_places + value: i32 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: round +- description: Pads the right side of a string with another string to a specified string length. 
+ impls: + - args: + - name: str + value: string + - name: n + value: i64 + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: str + value: string + - name: n + value: i64 + - name: padding_str + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: rpad +- description: Trims the specified trim string from the end of a string. If no trim string is provided, all spaces are removed from the end of the input string. + impls: + - args: + - name: str + value: string + - name: trim_str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: rtrim +- description: |- + Returns the sign of a number. + Negative numbers return `-1`. + Zero and positive numbers return `1`. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: signum +- description: Returns the sine of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: sin +- description: Returns the hyperbolic sine of a number. 
+ impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: sinh +- description: Splits a string based on a specified delimiter and returns the substring in the specified position. + impls: + - args: + - name: str + value: string + - name: delimiter + value: string + - name: pos + value: i64 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: split_part +- description: Returns the square root of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: sqrt +- description: Tests if a string starts with a substring. + impls: + - args: + - name: str + value: string + - name: substr + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: starts_with +- description: Returns the starting position of a specified substring in a string. Positions begin at 1. If the substring does not exist in the string, the function returns 0. + impls: + - args: + - name: str + value: string + - name: substr + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + metadata: + datafusion: + aliases: + - instr + - position + name: strpos +- description: |- + Returns an Arrow struct using the specified input expressions optionally named. + Fields in the returned struct use the optional name or the `cN` naming convention. + For example: `c0`, `c1`, `c2`, etc. 
+ For information on comparing and ordering struct values (including `NULL` handling), + see [Comparison and Ordering](struct_coercion.md#comparison-and-ordering). + impls: + - args: + - value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + metadata: + datafusion: + aliases: + - row + name: struct +- description: Extracts a substring of a specified number of characters from a specific starting position in a string. + impls: + - args: + - name: str + value: string + - name: start_pos + value: i64 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + - args: + - name: str + value: string + - name: start_pos + value: i64 + - name: length + value: i64 + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + metadata: + datafusion: + aliases: + - substring + name: substr +- description: |- + Returns the substring from str before count occurrences of the delimiter delim. + If count is positive, everything to the left of the final delimiter (counting from the left) is returned. + If count is negative, everything to the right of the final delimiter (counting from the right) is returned. + impls: + - args: + - name: str + value: string + - name: delim + value: string + - name: count + value: i64 + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + metadata: + datafusion: + aliases: + - substring_index + name: substr_index +- description: Returns the tangent of a number. + impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: tan +- description: Returns the hyperbolic tangent of a number. 
+ impls: + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: tanh +- description: Returns a string representation of a date, time, timestamp or duration based on a [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html). Unlike the PostgreSQL equivalent of this function numerical formatting is not supported. + impls: + - args: + - name: expression + value: date + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: time + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: time + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: time + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: time + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: timestamp_tz + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: timestamp + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: timestamp_tz + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: timestamp + - name: format + value: string 
+ deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: timestamp_tz + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: timestamp + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: timestamp_tz + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: timestamp + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: duration + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: duration + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: duration + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + - args: + - name: expression + value: duration + - name: format + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + metadata: + datafusion: + aliases: + - date_format + name: to_char +- description: |- + Converts a value to a date (`YYYY-MM-DD`). + Supports strings, numeric and timestamp types as input. + Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. + Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). + Returns the corresponding date. 
+ + Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: to_date +- description: Converts an integer to a hexadecimal string. + impls: + - args: + - name: int + value: integer + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: to_hex +- description: Converts a timestamp with a timezone to a timestamp without a timezone (with no offset or timezone information). This function handles daylight saving time changes. + impls: + - args: + - name: expression + value: timestamp + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: to_local_time +- description: |- + Converts a value to a time (`HH:MM:SS.nnnnnnnnn`). + Supports strings and timestamps as input. + Strings are parsed as `HH:MM:SS`, `HH:MM:SS.nnnnnnnnn`, or `HH:MM` if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. + Timestamps will have the time portion extracted. + Returns the corresponding time. + + Note: `to_time` returns Time64(Nanosecond), which represents the time of day in nanoseconds since midnight. + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: to_time +- description: "\nConverts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000`) in the session time zone. Supports strings,\ninteger, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. 
'2023-07-20T05:44:00')\nif no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. \nStrings that parse without a time zone are treated as if they are in the\nsession time zone, or UTC if no session time zone is set.\nIntegers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`).\n\nNote: `to_timestamp` returns `Timestamp(ns, TimeZone)` where the time zone is the session time zone. The supported range\nfor integer input is between`-9223372037` and `9223372036`. Supported range for string input is between\n`1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds`\nfor the input outside of supported bounds.\n\nThe session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.\nThe time zone can be a value like +00:00, 'Europe/London' etc.\n" + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: to_timestamp +- description: "\nConverts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000`) in the session time zone. Supports strings,\ninteger, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')\nif no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. 
\nStrings that parse without a time zone are treated as if they are in the\nsession time zone, or UTC if no session time zone is set.\nIntegers, unsigned integers, and doubles are interpreted as microseconds since the unix epoch (`1970-01-01T00:00:00Z`).\n\nThe session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.\nThe time zone can be a value like +00:00, 'Europe/London' etc.\n" + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: to_timestamp_micros +- description: "\nConverts a value to a timestamp (`YYYY-MM-DDT00:00:00.000`) in the session time zone. Supports strings,\ninteger, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')\nif no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. \nStrings that parse without a time zone are treated as if they are in the\nsession time zone, or UTC if no session time zone is set.\nIntegers, unsigned integers, and doubles are interpreted as milliseconds since the unix epoch (`1970-01-01T00:00:00Z`).\n\nThe session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.\nThe time zone can be a value like +00:00, 'Europe/London' etc.\n" + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: to_timestamp_millis +- description: "\nConverts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000`) in the session time zone. Supports strings,\ninteger, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')\nif no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. 
\nStrings that parse without a time zone are treated as if they are in the\nsession time zone. Integers, unsigned integers, and doubles are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`).\n\nThe session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.\nThe time zone can be a value like +00:00, 'Europe/London' etc.\n" + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: to_timestamp_nanos +- description: "\nConverts a value to a timestamp (`YYYY-MM-DDT00:00:00`) in the session time zone. Supports strings,\ninteger, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00')\nif no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. \nStrings that parse without a time zone are treated as if they are in the\nsession time zone, or UTC if no session time zone is set.\nIntegers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`).\n\nThe session time zone can be set using the statement `SET TIMEZONE = 'desired time zone'`.\nThe time zone can be a value like +00:00, 'Europe/London' etc.\n" + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: to_timestamp_seconds +- description: |2- + + Converts a value to seconds since the unix epoch (`1970-01-01T00:00:00`). + Supports strings, dates, timestamps, integer, unsigned integer, and float types as input. + Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') + if no [Chrono formats](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) are provided. 
+ Integers, unsigned integers, and floats are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00`). + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: to_unixtime +- description: Performs character-wise substitution based on a mapping. + impls: + - args: + - name: str + value: string + - name: from + value: string + - name: to + value: string + deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: translate +- description: Truncates a number to a whole number or truncated to the specified decimal places. + impls: + - args: + - name: numeric_expression + value: fp32 + - name: decimal_places + value: i64 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + - args: + - name: numeric_expression + value: fp64 + - name: decimal_places + value: i64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp64 + deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + - args: + - name: numeric_expression + value: fp32 + deterministic: true + nullability: MIRROR + return: fp32 + sessionDependent: false + name: trunc +- description: Casts the first argument to the data type of the second argument, returning NULL if the cast fails. Only the type of the second argument is used; its value is ignored. + impls: + - args: + - name: expression + value: any + - name: reference + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: try_cast_to_type +- description: Returns the value of the given field in the union when selected, or NULL otherwise. 
+ impls: + - args: + - name: union + value: any + - name: field_name + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: union_extract +- description: Returns the name of the currently selected field in the union + impls: + - args: + - name: union + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: union_tag +- description: Converts a string to upper-case. + impls: + - args: + - name: str + value: string + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + name: upper +- description: Returns [`UUID v4`](https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_%28random%29) string value which is unique per row. + impls: + - deterministic: false + nullability: MIRROR + return: string + sessionDependent: false + name: uuid +- description: Returns the version of DataFusion. + impls: + - deterministic: true + nullability: MIRROR + return: string + sessionDependent: false + name: version +- description: Attaches Arrow field metadata (key/value pairs) to the input expression. Keys must be non-empty constant strings and values must be constant strings (empty values are allowed). Existing metadata on the input field is preserved; new keys overwrite on collision. This is the inverse of `arrow_metadata`. + impls: + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + variadic: + min: 1.0 + parameterConsistency: INCONSISTENT + name: with_metadata +urn: extension:org.apache.datafusion:functions +window_functions: +- description: 'Relative rank of the current row: (number of rows preceding or peer with the current row) / (total rows).' + impls: + - deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + window_type: PARTITION + name: cume_dist +- description: Returns the rank of the current row without gaps. 
This function ranks rows in a dense manner, meaning consecutive ranks are assigned even for identical values. + impls: + - deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + name: dense_rank +- description: Returns value evaluated at the row that is the first row of the window frame. + impls: + - deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: any + - value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + name: first_value +- description: Returns value evaluated at the row that is offset rows before the current row within the partition; if there is no such row, instead return default (which must be of the same type as value). + impls: + - args: + - name: expr + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expr + value: any + - name: offset + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expr + value: any + - name: offset + value: any + - name: default + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + name: lag +- description: Returns value evaluated at the row that is the last row of the window frame. 
+ impls: + - deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: any + - value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + name: last_value +- description: Returns value evaluated at the row that is offset rows after the current row within the partition; if there is no such row, instead return default (which must be of the same type as value). + impls: + - args: + - name: expr + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expr + value: any + - name: offset + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expr + value: any + - name: offset + value: any + - name: default + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + name: lead +- description: Returns the value evaluated at the nth row of the window frame (counting from 1). Returns NULL if no such row exists. 
+ impls: + - deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: any + - name: n + value: any + deterministic: true + nullability: MIRROR + return: any + sessionDependent: false + window_type: PARTITION + name: nth_value +- description: Integer ranging from 1 to the argument value, dividing the partition as equally as possible + impls: + - args: + - name: expression + value: u64 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: u32 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: u16 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: u8 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: i64 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: i32 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: i16 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + - args: + - name: expression + value: i8 + deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + name: ntile +- description: Returns the percentage rank of the current row within its partition. The value ranges from 0 to 1 and is computed as `(rank - 1) / (total_rows - 1)`. 
+ impls: + - deterministic: true + nullability: MIRROR + return: fp64 + sessionDependent: false + window_type: PARTITION + name: percent_rank +- description: Returns the rank of the current row within its partition, allowing gaps between ranks. This function provides a ranking similar to `row_number`, but skips ranks for identical values. + impls: + - deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + name: rank +- description: Number of the current row within its partition, counting from 1. + impls: + - deterministic: true + nullability: MIRROR + return: u64 + sessionDependent: false + window_type: PARTITION + name: row_number diff --git a/datafusion/substrait/src/bin/print_substrait_function_yaml.rs b/datafusion/substrait/src/bin/print_substrait_function_yaml.rs new file mode 100644 index 0000000000000..848f1bfc6aa95 --- /dev/null +++ b/datafusion/substrait/src/bin/print_substrait_function_yaml.rs @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion::common::{DataFusionError, Result, exec_err}; +use datafusion_substrait::function_yaml::generate_function_extension; +use std::env; +use std::fs; +use std::io::{self, Write}; +use std::path::Path; + +const DATAFUSION_FUNCTIONS_PATH: &str = + "datafusion/substrait/extensions/functions_datafusion.yaml"; + +fn main() -> Result<()> { + let args: Vec = env::args().skip(1).collect(); + let contents = generated_yaml()?; + + match args.as_slice() { + [] => print_file(DATAFUSION_FUNCTIONS_PATH, &contents), + [flag] if flag == "--check" => check_file(DATAFUSION_FUNCTIONS_PATH, &contents), + [flag] if flag == "--write" => write_file(DATAFUSION_FUNCTIONS_PATH, &contents), + _ => exec_err!("Usage: print_substrait_function_yaml [--check|--write]"), + } +} + +fn generated_yaml() -> Result { + serde_yaml::to_string(&generate_function_extension()?) + .map_err(|e| DataFusionError::External(Box::new(e))) +} + +fn print_file(path: &str, contents: &str) -> Result<()> { + let mut stdout = io::stdout().lock(); + writeln!(stdout, "# {path}")?; + write!(stdout, "{contents}")?; + Ok(()) +} + +fn check_file(path: &str, contents: &str) -> Result<()> { + match fs::read_to_string(path) { + Ok(actual) if actual == contents => Ok(()), + Ok(_) => exec_err!( + "generated Substrait function YAML is out of date: {}. 
Run `cargo run -p datafusion-substrait --bin print_substrait_function_yaml -- --write`", + path + ), + Err(e) => exec_err!("failed to read {path}: {e}"), + } +} + +fn write_file(path: &str, contents: &str) -> Result<()> { + let path = Path::new(path); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + fs::write(path, contents)?; + Ok(()) +} diff --git a/datafusion/substrait/src/function_yaml/defaults.rs b/datafusion/substrait/src/function_yaml/defaults.rs new file mode 100644 index 0000000000000..d89c937355425 --- /dev/null +++ b/datafusion/substrait/src/function_yaml/defaults.rs @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! DataFusion-specific generation fallbacks. +//! +//! Most function declarations are inferred from DataFusion's runtime +//! `Signature` metadata. This module is the small catalog of cases that need +//! extra information to emit a complete Substrait simple extension file. +//! +//! Keep this module focused on DataFusion's built-in functions. Custom +//! registries should pass their own [`FunctionYamlOverrides`] through +//! [`FunctionYamlConfig`](super::FunctionYamlConfig). 
+ +use super::{ + FunctionSignatureOverride, FunctionVariadicConsistency, + FunctionVariadicConsistency::{Consistent, Inconsistent}, + FunctionYamlOverrides, +}; + +struct ReturnTypeDefault { + function: &'static str, + return_type: &'static str, +} + +struct ImplementationDefault { + function: &'static str, + implementation: ImplementationShape, +} + +enum ImplementationShape { + Exact { + args: &'static [&'static str], + return_type: &'static str, + }, + Variadic { + args: &'static [&'static str], + return_type: &'static str, + min: usize, + consistency: FunctionVariadicConsistency, + }, +} + +// Dynamic return-type functions where DataFusion cannot infer a concrete Arrow +// type from the signature alone. +const RETURN_TYPE_DEFAULTS: &[ReturnTypeDefault] = &[ + ReturnTypeDefault { + function: "arrow_typeof", + return_type: "string", + }, + ReturnTypeDefault { + function: "arrow_typeof_legacy", + return_type: "string", + }, + ReturnTypeDefault { + function: "from_unixtime", + return_type: "timestamp", + }, + ReturnTypeDefault { + function: "now", + return_type: "timestamp", + }, + ReturnTypeDefault { + function: "random", + return_type: "fp64", + }, + ReturnTypeDefault { + function: "version", + return_type: "string", + }, +]; + +// Functions whose DataFusion signature is `UserDefined`. The generator cannot +// inspect those callbacks, so each entry records the Substrait implementation +// shape that should be emitted for DataFusion's default registry. 
+const IMPLEMENTATION_DEFAULTS: &[ImplementationDefault] = &[ + variadic_default("array", &["any1"], "list", 1, Consistent), + exact_default("array_add", &["list", "list"], "list"), + variadic_default("array_concat", &["list"], "list", 1, Consistent), + exact_default("array_distance", &["list", "list"], "fp64"), + exact_default("array_normalize", &["list"], "list"), + exact_default("array_scale", &["list", "fp64"], "list"), + exact_default("arrow_cast", &["any", "any"], "any"), + exact_default("arrow_try_cast", &["any", "any"], "any"), + exact_default("cast_to_type", &["any", "any"], "any"), + variadic_default("coalesce", &["any1"], "any1", 1, Consistent), + exact_default("cosine_distance", &["list", "list"], "fp64"), + variadic_default("get_field", &["any", "string"], "any", 1, Consistent), + variadic_default("greatest", &["any1"], "any1", 1, Consistent), + exact_default("inner_product", &["list", "list"], "fp64"), + variadic_default("least", &["any1"], "any1", 1, Consistent), + variadic_default("make_array", &["any"], "list", 1, Inconsistent), + exact_default("map_extract", &["map", "any1"], "list"), + exact_default("max", &["any1"], "any1"), + exact_default("min", &["any1"], "any1"), + variadic_default("named_struct", &["any"], "struct", 1, Inconsistent), + exact_default("nullif", &["any1", "any1"], "any1"), + exact_default("nvl", &["any1", "any1"], "any1"), + exact_default("nvl2", &["any", "any1", "any1"], "any1"), + variadic_default("struct", &["any"], "struct", 1, Inconsistent), + exact_default("try_cast_to_type", &["any", "any"], "any"), +]; + +/// Return the overrides needed to generate YAML for DataFusion's default +/// function inventory. 
+pub fn datafusion_overrides() -> FunctionYamlOverrides { + let mut overrides = FunctionYamlOverrides::default(); + + for default in IMPLEMENTATION_DEFAULTS { + overrides + .user_defined_signatures + .entry(default.function.to_string()) + .or_default() + .push(default.implementation.to_override()); + } + + for default in RETURN_TYPE_DEFAULTS { + overrides.return_types.insert( + default.function.to_string(), + default.return_type.to_string(), + ); + } + + overrides +} + +// Creates an override for one fixed implementation signature. +const fn exact_default( + function: &'static str, + args: &'static [&'static str], + return_type: &'static str, +) -> ImplementationDefault { + ImplementationDefault { + function, + implementation: ImplementationShape::Exact { args, return_type }, + } +} + +// Creates an override for one variadic implementation signature. `args` +// describes the repeated argument pattern emitted into Substrait; `min` is the +// minimum number of accepted values for that repeated pattern. 
+const fn variadic_default( + function: &'static str, + args: &'static [&'static str], + return_type: &'static str, + min: usize, + consistency: FunctionVariadicConsistency, +) -> ImplementationDefault { + ImplementationDefault { + function, + implementation: ImplementationShape::Variadic { + args, + return_type, + min, + consistency, + }, + } +} + +impl ImplementationShape { + fn to_override(&self) -> FunctionSignatureOverride { + match self { + Self::Exact { args, return_type } => { + FunctionSignatureOverride::new(type_strings(args), *return_type) + } + Self::Variadic { + args, + return_type, + min, + consistency, + } => FunctionSignatureOverride::variadic( + type_strings(args), + *return_type, + *min, + *consistency, + ), + } + } +} + +fn type_strings(args: &[&str]) -> Vec { + args.iter().map(|arg| (*arg).to_string()).collect() +} diff --git a/datafusion/substrait/src/function_yaml/functions.rs b/datafusion/substrait/src/function_yaml/functions.rs new file mode 100644 index 0000000000000..0b1a2e066de26 --- /dev/null +++ b/datafusion/substrait/src/function_yaml/functions.rs @@ -0,0 +1,370 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use crate::function_yaml::FunctionYamlOverrides;
+use crate::function_yaml::signatures::{
+    return_type_or_any, signature_to_impls, type_name,
+};
+use datafusion::arrow::datatypes::{DataType, Field};
+use datafusion::common::{HashSet, Result, internal_err};
+use datafusion::logical_expr::function::WindowUDFFieldArgs;
+use datafusion::logical_expr::{
+    AggregateUDF, Documentation, HigherOrderTypeSignature, HigherOrderUDF, ScalarUDF,
+    Signature, TypeSignature, ValueOrLambda, WindowUDF,
+};
+use itertools::Itertools;
+use serde_json::{Map, Value, json};
+use std::sync::Arc;
+use substrait::text::simple_extensions as ext;
+
+/// Which top-level section of the extension file a function belongs to.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum FunctionKind {
+    Scalar,
+    Aggregate,
+    Window,
+}
+
+impl FunctionKind {
+    /// Section key used in duplicate-name error messages.
+    fn yaml_key(self) -> &'static str {
+        match self {
+            Self::Scalar => "scalar_functions",
+            Self::Aggregate => "aggregate_functions",
+            Self::Window => "window_functions",
+        }
+    }
+}
+
+/// Converts scalar and higher-order UDFs into scalar function declarations.
+///
+/// Higher-order functions are appended after the scalar ones, then the combined
+/// list is sorted by name and checked for duplicates.
+///
+/// # Errors
+///
+/// Returns the first conversion error, or an internal error on a duplicate name.
+pub fn collect_scalar_functions(
+    scalar_functions: &[Arc<ScalarUDF>],
+    higher_order_functions: &[Arc<HigherOrderUDF>],
+    overrides: &FunctionYamlOverrides,
+) -> Result<Vec<ext::ScalarFunction>> {
+    let functions = scalar_functions
+        .iter()
+        .map(|f| scalar_function(f, overrides))
+        .chain(
+            higher_order_functions
+                .iter()
+                .map(|f| higher_order_function(f, overrides)),
+        )
+        .collect::<Result<Vec<_>>>()?;
+    sort_and_validate_scalar_functions(functions)
+}
+
+/// Converts aggregate UDFs into declarations sorted by unique name.
+///
+/// # Errors
+///
+/// Returns the first conversion error, or an internal error on a duplicate name.
+pub fn collect_aggregate_functions(
+    aggregate_functions: &[Arc<AggregateUDF>],
+    overrides: &FunctionYamlOverrides,
+) -> Result<Vec<ext::AggregateFunction>> {
+    let functions = aggregate_functions
+        .iter()
+        .map(|f| aggregate_function(f, overrides))
+        .collect::<Result<Vec<_>>>()?;
+    sort_and_validate_aggregate_functions(functions)
+}
+
+/// Converts window UDFs into declarations sorted by unique name.
+///
+/// # Errors
+///
+/// Returns the first conversion error, or an internal error on a duplicate name.
+pub fn collect_window_functions(
+    window_functions: &[Arc<WindowUDF>],
+    overrides: &FunctionYamlOverrides,
+) -> Result<Vec<ext::WindowFunction>> {
+    let functions = window_functions
+        .iter()
+        .map(|f| window_function(f, overrides))
+        .collect::<Result<Vec<_>>>()?;
+    sort_and_validate_window_functions(functions)
+}
+
+/// Builds the `metadata` map for a function from its aliases.
+///
+/// Returns an empty map when there are no aliases; otherwise a single
+/// `"datafusion"` entry holding the sorted alias list under `"aliases"`.
+pub fn metadata_from_aliases(aliases: &[String]) -> Map<String, Value> {
+    // Sorting keeps the generated output independent of registration order.
+    let aliases: Vec<String> = aliases.iter().sorted().cloned().collect();
+    let mut metadata = Map::new();
+    if !aliases.is_empty() {
+        metadata.insert("datafusion".to_string(), json!({ "aliases": aliases }));
+    }
+    metadata
+}
+
+/// Turns a scalar implementation into an aggregate implementation.
+///
+/// Arguments, determinism, options, return type, session dependence and
+/// variadic behaviour are copied over. `decomposable` and `intermediate` are
+/// derived from `function_name`; `nullability` is `Mirror` when `nullable` is
+/// true and `DeclaredOutput` otherwise; `ordered` is always `false`.
+pub fn aggregate_impl(
+    function_name: &str,
+    nullable: bool,
+    implementation: ext::ScalarFunctionImplsItem,
+) -> ext::AggregateFunctionImplsItem {
+    let return_type = type_name(&implementation.return_.0);
+    ext::AggregateFunctionImplsItem {
+        args: implementation.args,
+        decomposable: Some(aggregate_decomposable(function_name)),
+        deterministic: implementation.deterministic,
+        implementation: implementation.implementation,
+        intermediate: Some(ext::Intermediate(ext::Type::String(
+            aggregate_intermediate_type(function_name, return_type),
+        ))),
+        maxset: None,
+        nullability: Some(if nullable {
+            ext::NullabilityHandling::Mirror
+        } else {
+            ext::NullabilityHandling::DeclaredOutput
+        }),
+        options: implementation.options,
+        ordered: Some(ext::Ordered(false)),
+        return_: implementation.return_,
+        session_dependent: implementation.session_dependent,
+        variadic: implementation.variadic,
+    }
+}
+
+/// Decomposability by function name: `None` for the median and percentile
+/// functions listed here, `Many` for everything else.
+pub fn aggregate_decomposable(function_name: &str) -> ext::Decomposable {
+    match function_name {
+        "median"
+        | "approx_median"
+        | "percentile_cont"
+        | "approx_percentile_cont"
+        | "approx_percentile_cont_with_weight" => ext::Decomposable::None,
+        _ => ext::Decomposable::Many,
+    }
+}
+
+/// Intermediate type by function name: `i64` for the counting functions listed
+/// here, otherwise the function's own return type.
+pub fn aggregate_intermediate_type(function_name: &str, return_type: &str) -> String {
+    match function_name {
+        "count" | "regr_count" | "approx_distinct" => "i64".to_string(),
+        _ => return_type.to_string(),
+    }
+}
+
+/// Builds the declaration for one scalar UDF.
+fn scalar_function(
+    function: &Arc<ScalarUDF>,
+    overrides: &FunctionYamlOverrides,
+) -> Result<ext::ScalarFunction> {
+    let signature = function.signature();
+    let documentation = function.documentation();
+    let arg_names = argument_names(signature, documentation);
+    let impls = signature_to_impls(
+        function.name(),
+        signature,
+        &arg_names,
+        overrides,
+        |arg_types| {
+            return_type_or_any(function.name(), overrides, arg_types, || {
+                function.return_type(arg_types)
+            })
+        },
+    )?;
+
+    Ok(ext::ScalarFunction {
+        description: documentation.map(|doc| doc.description.clone()),
+        impls,
+        metadata: metadata_from_aliases(function.aliases()),
+        name: function.name().to_string(),
+    })
+}
+
+/// Builds the declaration for one aggregate UDF by deriving scalar-shaped
+/// implementations and converting each with [`aggregate_impl`].
+fn aggregate_function(
+    function: &Arc<AggregateUDF>,
+    overrides: &FunctionYamlOverrides,
+) -> Result<ext::AggregateFunction> {
+    let signature = function.signature();
+    let documentation = function.documentation();
+    let arg_names = argument_names(signature, documentation);
+    let nullable = function.is_nullable();
+    let impls = signature_to_impls(
+        function.name(),
+        signature,
+        &arg_names,
+        overrides,
+        |arg_types| {
+            return_type_or_any(function.name(), overrides, arg_types, || {
+                function.return_type(arg_types)
+            })
+        },
+    )?
+    .into_iter()
+    .map(|implementation| aggregate_impl(function.name(), nullable, implementation))
+    .collect();
+
+    Ok(ext::AggregateFunction {
+        description: documentation.map(|doc| doc.description.clone()),
+        impls,
+        metadata: metadata_from_aliases(function.aliases()),
+        name: function.name().to_string(),
+    })
+}
+
+/// Builds the declaration for one window UDF by deriving scalar-shaped
+/// implementations and converting each with [`window_impl`].
+fn window_function(
+    function: &Arc<WindowUDF>,
+    overrides: &FunctionYamlOverrides,
+) -> Result<ext::WindowFunction> {
+    let signature = function.signature();
+    let documentation = function.documentation();
+    let arg_names = argument_names(signature, documentation);
+    let impls = signature_to_impls(
+        function.name(),
+        signature,
+        &arg_names,
+        overrides,
+        |arg_types| {
+            return_type_or_any(function.name(), overrides, arg_types, || {
+                window_return_type(function, arg_types)
+            })
+        },
+    )?
+    .into_iter()
+    .map(window_impl)
+    .collect();
+
+    Ok(ext::WindowFunction {
+        description: documentation.map(|doc| doc.description.clone()),
+        impls,
+        metadata: metadata_from_aliases(function.aliases()),
+        name: function.name().to_string(),
+    })
+}
+
+/// Builds a scalar declaration for one higher-order UDF.
+///
+/// The return type is always declared as `any`.
+fn higher_order_function(
+    function: &Arc<HigherOrderUDF>,
+    overrides: &FunctionYamlOverrides,
+) -> Result<ext::ScalarFunction> {
+    let signature = higher_order_signature(function.signature());
+    let documentation = function.documentation();
+    let arg_names = argument_names(&signature, documentation);
+    let impls = signature_to_impls(
+        function.name(),
+        &signature,
+        &arg_names,
+        overrides,
+        |_arg_types| Ok("any".to_string()),
+    )?;
+
+    Ok(ext::ScalarFunction {
+        description: documentation.map(|doc| doc.description.clone()),
+        impls,
+        metadata: metadata_from_aliases(function.aliases()),
+        name: function.name().to_string(),
+    })
+}
+
+/// Maps a higher-order signature onto an ordinary [`Signature`] with the same
+/// volatility so it can go through the shared signature mapping.
+fn higher_order_signature(
+    signature: &datafusion::logical_expr::HigherOrderSignature,
+) -> Signature {
+    let type_signature = match &signature.type_signature {
+        HigherOrderTypeSignature::UserDefined => TypeSignature::UserDefined,
+        HigherOrderTypeSignature::VariadicAny => TypeSignature::VariadicAny,
+        HigherOrderTypeSignature::Any(count) => TypeSignature::Any(*count),
+        // Both Value and Lambda are opaque at the Substrait level; represent as Null.
+        HigherOrderTypeSignature::Exact(args) => TypeSignature::Exact(
+            args.iter()
+                .map(|arg| match arg {
+                    ValueOrLambda::Value(()) | ValueOrLambda::Lambda(()) => {
+                        DataType::Null
+                    }
+                })
+                .collect(),
+        ),
+    };
+    Signature::new(type_signature, signature.volatility)
+}
+
+/// Picks argument names for a function: the signature's `parameter_names` when
+/// present, otherwise the names of the documented arguments, otherwise none.
+fn argument_names(
+    signature: &Signature,
+    documentation: Option<&Documentation>,
+) -> Vec<String> {
+    if let Some(parameter_names) = &signature.parameter_names {
+        return parameter_names.clone();
+    }
+
+    documentation
+        .and_then(|doc| doc.arguments.as_ref())
+        .map(|args| args.iter().map(|(name, _)| name.clone()).collect())
+        .unwrap_or_default()
+}
+
+/// Turns a scalar implementation into a window implementation.
+///
+/// Shared fields are copied; `decomposable`, `intermediate`, `maxset` and
+/// `ordered` are left unset and `window_type` is always `Partition`.
+fn window_impl(
+    implementation: ext::ScalarFunctionImplsItem,
+) -> ext::WindowFunctionImplsItem {
+    ext::WindowFunctionImplsItem {
+        args: implementation.args,
+        decomposable: None,
+        deterministic: implementation.deterministic,
+        implementation: implementation.implementation,
+        intermediate: None,
+        maxset: None,
+        nullability: implementation.nullability,
+        options: implementation.options,
+        ordered: None,
+        return_: implementation.return_,
+        session_dependent: implementation.session_dependent,
+        variadic: implementation.variadic,
+        window_type: Some(ext::WindowFunctionImplsItemWindowType::Partition),
+    }
+}
+
+/// Asks a window UDF for its output type given concrete argument types.
+///
+/// Each argument is wrapped in a nullable field named `arg_{idx}` because
+/// `WindowUDF::field` takes input fields rather than bare data types.
+fn window_return_type(
+    function: &Arc<WindowUDF>,
+    arg_types: &[DataType],
+) -> Result<DataType> {
+    let fields: Vec<_> = arg_types
+        .iter()
+        .enumerate()
+        .map(|(idx, data_type)| {
+            Arc::new(Field::new(format!("arg_{idx}"), data_type.clone(), true))
+        })
+        .collect();
+    let field = function.field(WindowUDFFieldArgs::new(&fields, function.name()))?;
+    Ok(field.data_type().clone())
+}
+
+/// Sorts scalar declarations by name and rejects duplicate names.
+fn sort_and_validate_scalar_functions(
+    mut functions: Vec<ext::ScalarFunction>,
+) -> Result<Vec<ext::ScalarFunction>> {
+    functions.sort_by(|left, right| left.name.cmp(&right.name));
+    validate_unique_names(
+        functions.iter().map(|function| function.name.as_str()),
+        FunctionKind::Scalar,
+    )?;
+    Ok(functions)
+}
+
+/// Sorts aggregate declarations by name and rejects duplicate names.
+fn sort_and_validate_aggregate_functions(
+    mut functions: Vec<ext::AggregateFunction>,
+) -> Result<Vec<ext::AggregateFunction>> {
+    functions.sort_by(|left, right| left.name.cmp(&right.name));
+    validate_unique_names(
+        functions.iter().map(|function| function.name.as_str()),
+        FunctionKind::Aggregate,
+    )?;
+    Ok(functions)
+}
+
+/// Sorts window declarations by name and rejects duplicate names.
+fn sort_and_validate_window_functions(
+    mut functions: Vec<ext::WindowFunction>,
+) -> Result<Vec<ext::WindowFunction>> {
+    functions.sort_by(|left, right| left.name.cmp(&right.name));
+    validate_unique_names(
+        functions.iter().map(|function| function.name.as_str()),
+        FunctionKind::Window,
+    )?;
+    Ok(functions)
+}
+
+/// Returns an internal error naming the first repeated function name, if any.
+fn validate_unique_names<'a>(
+    names: impl IntoIterator<Item = &'a str>,
+    kind: FunctionKind,
+) -> Result<()> {
+    let mut seen: HashSet<&str> = HashSet::new();
+    for name in names {
+        // `insert` returns false when the name was already present.
+        if !seen.insert(name) {
+            return internal_err!("duplicate {} function `{}`", kind.yaml_key(), name);
+        }
+    }
+    Ok(())
+}
diff --git a/datafusion/substrait/src/function_yaml/mod.rs b/datafusion/substrait/src/function_yaml/mod.rs
new file mode 100644
index 0000000000000..915922d6ca01d
--- /dev/null
+++ b/datafusion/substrait/src/function_yaml/mod.rs
@@ -0,0 +1,230 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Generate Substrait simple extension YAML declarations for DataFusion functions.
+//!
+//! The default generator uses the same runtime inventory as the function
+//!
documentation binary, so committed YAML tracks the functions registered in a +//! default [`SessionStateDefaults`]. Downstream crates can pass their own +//! [`FunctionYamlInventory`] and [`FunctionYamlConfig`] to build a Substrait +//! extension declaration for custom UDFs while reusing the same signature +//! mapping. +//! +//! The generator intentionally fails when it cannot infer a complete Substrait +//! declaration. Use [`FunctionYamlOverrides`] for signatures or return types +//! that cannot be derived from DataFusion's `Signature` metadata. + +use datafusion::common::Result; +use datafusion::execution::SessionStateDefaults; +use datafusion::logical_expr::{AggregateUDF, HigherOrderUDF, ScalarUDF, WindowUDF}; +use std::collections::BTreeMap; +use std::sync::Arc; + +mod defaults; +mod functions; +mod signatures; +#[cfg(test)] +mod tests; +mod types; + +use defaults::datafusion_overrides; +use functions::{ + collect_aggregate_functions, collect_scalar_functions, collect_window_functions, +}; +use substrait::text::simple_extensions as ext; + +const DATAFUSION_FUNCTIONS_URN: &str = "extension:org.apache.datafusion:functions"; + +/// Functions to include in a generated Substrait extension declaration. +/// +/// This type is public so consumers can generate Substrait extension YAML for +/// custom function registries, not only DataFusion's built-in defaults. +#[derive(Debug, Clone)] +pub struct FunctionYamlInventory { + /// Scalar UDFs to emit into the extension file. + pub scalar_functions: Vec>, + /// Higher-order UDFs to emit into the scalar function declarations. + pub higher_order_functions: Vec>, + /// Aggregate UDFs to emit into the extension file. + pub aggregate_functions: Vec>, + /// Window UDFs to emit into the extension file. + pub window_functions: Vec>, +} + +impl FunctionYamlInventory { + /// Return the function inventory registered by a default DataFusion session. 
+ pub fn datafusion_defaults() -> Self { + Self { + scalar_functions: SessionStateDefaults::default_scalar_functions(), + higher_order_functions: SessionStateDefaults::default_higher_order_functions( + ), + aggregate_functions: SessionStateDefaults::default_aggregate_functions(), + window_functions: SessionStateDefaults::default_window_functions(), + } + } +} + +/// Configuration for generated Substrait extension declarations. +/// +/// The default values produce DataFusion's built-in function declarations. +/// Custom callers can replace URNs and inference overrides while using the same +/// generation pipeline. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FunctionYamlConfig { + /// URN written to the extension declaration. + pub urn: String, + /// Explicit mappings for signatures and return types that are not inferable. + pub overrides: FunctionYamlOverrides, +} + +impl Default for FunctionYamlConfig { + fn default() -> Self { + Self { + urn: DATAFUSION_FUNCTIONS_URN.to_string(), + overrides: FunctionYamlOverrides::datafusion_defaults(), + } + } +} + +/// Explicit mappings for function declarations that are not inferable. +/// +/// DataFusion signatures generally describe argument shape and volatility, but +/// `UserDefined` signatures and some dynamic return types need explicit +/// Substrait type strings. Keys are canonical function names. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct FunctionYamlOverrides { + /// Complete implementation declarations for `TypeSignature::UserDefined`. + pub user_defined_signatures: BTreeMap>, + /// Return type fallback for functions whose return type cannot be inferred. + pub return_types: BTreeMap, +} + +impl FunctionYamlOverrides { + /// Return the overrides required by DataFusion's default function set. + pub fn datafusion_defaults() -> Self { + datafusion_overrides() + } +} + +/// Override for a single Substrait function implementation signature. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FunctionSignatureOverride { + /// Substrait argument type strings, in function argument order. + pub args: Vec, + /// Substrait return type string. + pub return_type: String, + /// Variadic metadata when the implementation accepts repeated arguments. + pub variadic: Option, +} + +impl FunctionSignatureOverride { + /// Create a non-variadic implementation override. + pub fn new(args: Vec, return_type: impl Into) -> Self { + Self { + args, + return_type: return_type.into(), + variadic: None, + } + } + + /// Create a variadic implementation override. + pub fn variadic( + args: Vec, + return_type: impl Into, + min: usize, + parameter_consistency: FunctionVariadicConsistency, + ) -> Self { + Self { + args, + return_type: return_type.into(), + variadic: Some(FunctionVariadicOverride { + min, + parameter_consistency, + }), + } + } +} + +/// Variadic metadata for a Substrait implementation override. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FunctionVariadicOverride { + /// Minimum number of values accepted by the variadic argument. + pub min: usize, + /// Whether all repeated values must use a consistent type parameter. + pub parameter_consistency: FunctionVariadicConsistency, +} + +/// Substrait parameter consistency for variadic function arguments. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FunctionVariadicConsistency { + /// Repeated variadic arguments share the same type parameter. + Consistent, + /// Repeated variadic arguments can use independent types. + Inconsistent, +} + +impl From for ext::VariadicBehaviorParameterConsistency { + fn from(c: FunctionVariadicConsistency) -> Self { + match c { + FunctionVariadicConsistency::Consistent => { + ext::VariadicBehaviorParameterConsistency::Consistent + } + FunctionVariadicConsistency::Inconsistent => { + ext::VariadicBehaviorParameterConsistency::Inconsistent + } + } + } +} + +/// Generate the DataFusion Substrait function extension declaration. 
+///
+/// # Errors
+///
+/// Propagates any error from [`generate_function_extension_for_inventory`].
+pub fn generate_function_extension() -> Result<ext::SimpleExtensions> {
+    generate_function_extension_for_inventory(
+        &FunctionYamlInventory::datafusion_defaults(),
+        &FunctionYamlConfig::default(),
+    )
+}
+
+/// Generate a Substrait function extension declaration for an explicit inventory.
+///
+/// This is the primary API for consumers that want to build declarations for a custom
+/// function registry. [`generate_function_extension`] is a convenience wrapper
+/// that calls this function with DataFusion's default inventory and config.
+///
+/// The result carries `config.urn` and the collected scalar, aggregate and
+/// window declarations; dependencies, metadata, types and type variations are
+/// left empty.
+///
+/// # Errors
+///
+/// Returns the first error raised while collecting scalar, aggregate or window
+/// functions, in that order.
+pub fn generate_function_extension_for_inventory(
+    inventory: &FunctionYamlInventory,
+    config: &FunctionYamlConfig,
+) -> Result<ext::SimpleExtensions> {
+    let scalar = collect_scalar_functions(
+        &inventory.scalar_functions,
+        &inventory.higher_order_functions,
+        &config.overrides,
+    )?;
+    let aggregate =
+        collect_aggregate_functions(&inventory.aggregate_functions, &config.overrides)?;
+    let window =
+        collect_window_functions(&inventory.window_functions, &config.overrides)?;
+
+    Ok(ext::SimpleExtensions {
+        aggregate_functions: aggregate,
+        dependencies: Default::default(),
+        metadata: Default::default(),
+        scalar_functions: scalar,
+        type_variations: vec![],
+        types: vec![],
+        urn: config.urn.clone(),
+        window_functions: window,
+    })
+}
diff --git a/datafusion/substrait/src/function_yaml/signatures.rs b/datafusion/substrait/src/function_yaml/signatures.rs
new file mode 100644
index 0000000000000..3b6dd99289c6d
--- /dev/null
+++ b/datafusion/substrait/src/function_yaml/signatures.rs
@@ -0,0 +1,495 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::function_yaml::FunctionYamlOverrides; +use datafusion::arrow::datatypes::DataType; +use datafusion::common::{DataFusionError, HashSet, Result, internal_err}; +use datafusion::logical_expr::{ + ArrayFunctionArgument, ArrayFunctionSignature, Coercion, Signature, TypeSignature, +}; +use substrait::text::simple_extensions as ext; + +use super::types::{ + arrow_type_to_substrait, type_class_to_substrait, volatility_to_substrait, +}; + +pub fn signature_to_impls( + function_name: &str, + signature: &Signature, + arg_names: &[String], + overrides: &FunctionYamlOverrides, + return_type: F, +) -> Result> +where + F: Fn(&[DataType]) -> Result, +{ + let mut impls = type_signature_to_impls( + function_name, + &signature.type_signature, + arg_names, + overrides, + &return_type, + )?; + + let (deterministic, session_dependent) = + volatility_to_substrait(&signature.volatility); + for implementation in &mut impls { + implementation.deterministic = Some(ext::Deterministic(deterministic)); + implementation.session_dependent = Some(ext::SessionDependent(session_dependent)); + } + + // Dedup collapses impls whose Arrow types map to the same Substrait type + // (e.g. Utf8/LargeUtf8/Utf8View all become "string"). 
+ deduplicate_impls(&mut impls); + Ok(impls) +} + +pub fn return_type_or_any( + function_name: &str, + overrides: &FunctionYamlOverrides, + arg_types: &[DataType], + f: F, +) -> Result +where + F: FnOnce() -> Result, +{ + // When arg types are not yet resolved (Null placeholders), emit "any" rather + // than calling the return-type closure, which would likely fail or mislead. + if arg_types + .iter() + .any(|data_type| data_type == &DataType::Null) + { + return Ok("any".to_string()); + } + + match f().and_then(|data_type| arrow_type_to_substrait(&data_type)) { + Ok(data_type) => Ok(data_type), + Err(_) => overrides + .return_types + .get(function_name) + .cloned() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "cannot infer Substrait return type for `{function_name}`" + )) + }), + } +} + +pub fn arg_yaml(idx: usize, arg_names: &[String], value: String) -> ext::ArgumentsItem { + // Names that are not YAML plain-safe are dropped rather than quoted, + // because quoted scalar keys interact poorly with some Substrait parsers. 
+ ext::ValueArg { + constant: None, + description: None, + name: arg_names + .get(idx) + .cloned() + .filter(|name| is_yaml_plain_safe(name)), + value: ext::Type::String(value), + } + .into() +} + +pub fn function_impl( + args: Vec, + variadic: Option, + return_type: String, +) -> ext::ScalarFunctionImplsItem { + ext::ScalarFunctionImplsItem { + args: (!args.is_empty()).then_some(ext::Arguments(args)), + deterministic: None, + implementation: None, + nullability: Some(ext::NullabilityHandling::Mirror), + options: None, + return_: ext::ReturnValue(ext::Type::String(return_type)), + session_dependent: None, + variadic, + } +} + +#[cfg(test)] +pub fn detect_duplicate_impls( + function_name: &str, + impls: &[ext::ScalarFunctionImplsItem], +) -> Result<()> { + let mut signatures = HashSet::new(); + for implementation in impls { + let key = signature_key(implementation); + if !signatures.insert(key) { + return internal_err!( + "duplicate Substrait implementation signature for `{function_name}`" + ); + } + } + Ok(()) +} + +pub fn type_name(value: &ext::Type) -> &str { + match value { + ext::Type::String(value) => value, + ext::Type::Object(_) => "object", + } +} + +fn type_signature_to_impls( + function_name: &str, + signature: &TypeSignature, + arg_names: &[String], + overrides: &FunctionYamlOverrides, + return_type: &F, +) -> Result> +where + F: Fn(&[DataType]) -> Result, +{ + match signature { + TypeSignature::Nullary => { + Ok(vec![function_impl(vec![], None, return_type(&[])?)]) + } + TypeSignature::Exact(types) => Ok(vec![function_impl( + typed_args(types, arg_names)?, + None, + return_type(types)?, + )]), + TypeSignature::Uniform(count, types) => types + .iter() + .map(|data_type| { + let arg_types = vec![data_type.clone(); *count]; + let return_type = return_type(&arg_types) + .or_else(|_| arrow_type_to_substrait(data_type))?; + Ok(function_impl( + typed_args(&arg_types, arg_names)?, + None, + return_type, + )) + }) + .collect(), + TypeSignature::OneOf(signatures) 
=> signatures + .iter() + .map(|signature| { + type_signature_to_impls( + function_name, + signature, + arg_names, + overrides, + return_type, + ) + }) + .try_flatten_vec(), + TypeSignature::Variadic(types) => types + .iter() + .map(|data_type| { + Ok(function_impl( + vec![arg_yaml(0, arg_names, arrow_type_to_substrait(data_type)?)], + Some(variadic( + 1, + ext::VariadicBehaviorParameterConsistency::Consistent, + )), + return_type(std::slice::from_ref(data_type))?, + )) + }) + .collect(), + TypeSignature::VariadicAny => Ok(vec![function_impl( + vec![arg_yaml(0, arg_names, "any".to_string())], + Some(variadic( + 1, + ext::VariadicBehaviorParameterConsistency::Inconsistent, + )), + "any".to_string(), + )]), + TypeSignature::Any(count) => Ok(vec![function_impl( + (0..*count) + .map(|idx| arg_yaml(idx, arg_names, "any".to_string())) + .collect(), + None, + "any".to_string(), + )]), + TypeSignature::Comparable(count) => Ok(vec![function_impl( + (0..*count) + .map(|idx| arg_yaml(idx, arg_names, "any1".to_string())) + .collect(), + None, + "any1".to_string(), + )]), + TypeSignature::Numeric(count) => { + let types = [ + DataType::Int8, + DataType::Int16, + DataType::Int32, + DataType::Int64, + DataType::UInt8, + DataType::UInt16, + DataType::UInt32, + DataType::UInt64, + DataType::Float32, + DataType::Float64, + ]; + uniform_impls(&types, *count, arg_names, return_type) + } + TypeSignature::String(count) => { + let types = [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View]; + uniform_impls(&types, *count, arg_names, return_type) + } + TypeSignature::Coercible(coercions) => Ok(vec![function_impl( + coercion_args(coercions, arg_names)?, + None, + "any".to_string(), + )]), + TypeSignature::ArraySignature(array_signature) => Ok(vec![function_impl( + array_signature_args(array_signature, arg_names), + None, + array_signature_return(function_name), + )]), + TypeSignature::UserDefined => { + user_defined_override(function_name, arg_names, overrides) + } + } +} + +fn 
uniform_impls( + types: &[DataType], + count: usize, + arg_names: &[String], + return_type: &F, +) -> Result> +where + F: Fn(&[DataType]) -> Result, +{ + types + .iter() + .map(|data_type| { + let arg_types = vec![data_type.clone(); count]; + let return_type = return_type(&arg_types) + .or_else(|_| arrow_type_to_substrait(data_type))?; + Ok(function_impl( + typed_args(&arg_types, arg_names)?, + None, + return_type, + )) + }) + .collect() +} + +fn typed_args( + types: &[DataType], + arg_names: &[String], +) -> Result> { + types + .iter() + .enumerate() + .map(|(idx, data_type)| { + Ok(arg_yaml( + idx, + arg_names, + arrow_type_to_substrait(data_type)?, + )) + }) + .collect() +} + +fn coercion_args( + coercions: &[Coercion], + arg_names: &[String], +) -> Result> { + coercions + .iter() + .enumerate() + .map(|(idx, coercion)| { + let value = match coercion { + Coercion::Exact { desired_type } + | Coercion::Implicit { desired_type, .. } => { + type_class_to_substrait(desired_type)? + } + }; + Ok(arg_yaml(idx, arg_names, value)) + }) + .collect() +} + +fn array_signature_args( + signature: &ArrayFunctionSignature, + arg_names: &[String], +) -> Vec { + match signature { + ArrayFunctionSignature::Array { arguments, .. 
} => arguments + .iter() + .enumerate() + .map(|(idx, arg)| { + let value = match arg { + ArrayFunctionArgument::Element => "any1", + ArrayFunctionArgument::Index => "i64", + ArrayFunctionArgument::Array => "list", + ArrayFunctionArgument::String => "string", + }; + arg_yaml(idx, arg_names, value.to_string()) + }) + .collect(), + ArrayFunctionSignature::RecursiveArray => { + vec![arg_yaml(0, arg_names, "list".to_string())] + } + ArrayFunctionSignature::MapArray => { + vec![arg_yaml(0, arg_names, "map".to_string())] + } + } +} + +fn array_signature_return(function_name: &str) -> String { + match function_name { + "array_length" | "cardinality" => "i64".to_string(), + "array_empty" | "empty" => "boolean".to_string(), + "array_has" | "array_has_all" | "array_has_any" | "array_any_match" + | "array_all_match" => "boolean".to_string(), + "array_element" | "array_pop_front" | "array_pop_back" => "any1".to_string(), + _ => "list".to_string(), + } +} + +fn user_defined_override( + function_name: &str, + arg_names: &[String], + overrides: &FunctionYamlOverrides, +) -> Result> { + let Some(override_signatures) = overrides.user_defined_signatures.get(function_name) + else { + return internal_err!( + "no Substrait function YAML override for UserDefined signature `{function_name}`" + ); + }; + + override_signatures + .iter() + .map(|override_signature| { + Ok(function_impl( + override_signature + .args + .iter() + .enumerate() + .map(|(idx, value)| arg_yaml(idx, arg_names, value.clone())) + .collect(), + override_signature + .variadic + .as_ref() + .map(|variadic_override| { + variadic( + variadic_override.min, + ext::VariadicBehaviorParameterConsistency::from( + variadic_override.parameter_consistency, + ), + ) + }), + override_signature.return_type.clone(), + )) + }) + .collect() +} + +fn variadic( + min: usize, + parameter_consistency: ext::VariadicBehaviorParameterConsistency, +) -> ext::VariadicBehavior { + ext::VariadicBehavior { + max: None, + min: Some(min as f64), + 
parameter_consistency: Some(parameter_consistency), + } +} + +fn deduplicate_impls(impls: &mut Vec) { + let mut seen = HashSet::new(); + impls.retain(|implementation| seen.insert(signature_key(implementation))); +} + +#[derive(Hash, PartialEq, Eq)] +struct ArgumentKey { + name: Option, + type_repr: String, +} + +#[derive(Hash, PartialEq, Eq)] +struct VariadicKey { + min_bits: Option, + parameter_consistency: Option, +} + +#[derive(Hash, PartialEq, Eq)] +struct SignatureKey { + args: Vec, + variadic: Option, + return_type: String, +} + +fn signature_key(implementation: &ext::ScalarFunctionImplsItem) -> SignatureKey { + SignatureKey { + args: implementation + .args + .as_ref() + .map(|args| args.iter().map(argument_key).collect()) + .unwrap_or_default(), + variadic: implementation + .variadic + .as_ref() + .map(|variadic| VariadicKey { + min_bits: variadic.min.map(f64::to_bits), + parameter_consistency: variadic + .parameter_consistency + .map(|consistency| consistency.to_string()), + }), + return_type: type_name(&implementation.return_.0).to_string(), + } +} + +fn argument_key(argument: &ext::ArgumentsItem) -> ArgumentKey { + match argument { + ext::ArgumentsItem::ValueArg(arg) => ArgumentKey { + name: arg.name.clone(), + type_repr: type_name(&arg.value).to_string(), + }, + ext::ArgumentsItem::EnumerationArg(arg) => ArgumentKey { + name: arg.name.clone(), + type_repr: format!("{:?}", arg.options), + }, + ext::ArgumentsItem::TypeArg(arg) => ArgumentKey { + name: arg.name.clone(), + type_repr: arg.type_.clone(), + }, + } +} + +fn is_yaml_plain_safe(value: &str) -> bool { + !value.is_empty() + && value.chars().all(|c| { + c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '<' | '>' | ',' | '?') + }) + && !matches!(value, "true" | "false" | "null" | "TRUE" | "FALSE" | "NULL") +} + +trait TryFlattenVec { + fn try_flatten_vec(self) -> Result>; +} + +impl TryFlattenVec for I +where + I: Iterator>>, +{ + fn try_flatten_vec(self) -> Result> { + let mut out = vec![]; + for 
item in self { + out.extend(item?); + } + Ok(out) + } +} diff --git a/datafusion/substrait/src/function_yaml/tests.rs b/datafusion/substrait/src/function_yaml/tests.rs new file mode 100644 index 0000000000000..a5336e1132dd8 --- /dev/null +++ b/datafusion/substrait/src/function_yaml/tests.rs @@ -0,0 +1,317 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use super::functions::{ + aggregate_decomposable, aggregate_impl, aggregate_intermediate_type, + metadata_from_aliases, +}; +use super::signatures::{ + arg_yaml, detect_duplicate_impls, function_impl, return_type_or_any, + signature_to_impls, type_name, +}; +use super::types::{ + arrow_type_to_substrait, type_class_to_substrait, volatility_to_substrait, +}; +use super::*; +use datafusion::arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit}; +use datafusion::common::types::logical_float64; +use datafusion::common::{DataFusionError, Result}; +use datafusion::logical_expr::{ + Signature, TypeSignature, TypeSignatureClass, Volatility, +}; +use std::sync::Arc; +use substrait::text::simple_extensions as ext; + +fn scalar_impls( + signature: Signature, + return_type: F, +) -> Vec +where + F: Fn(&[DataType]) -> Result, +{ + signature_to_impls( + "test", + &signature, + &["x".to_string(), "y".to_string()], + &FunctionYamlOverrides::default(), + return_type, + ) + .unwrap() +} + +fn args(implementation: &ext::ScalarFunctionImplsItem) -> &[ext::ArgumentsItem] { + implementation + .args + .as_ref() + .map(|args| args.as_slice()) + .unwrap_or_default() +} + +fn value_arg(argument: &ext::ArgumentsItem) -> (&Option, &str) { + match argument { + ext::ArgumentsItem::ValueArg(arg) => (&arg.name, type_name(&arg.value)), + _ => panic!("expected value arg"), + } +} + +fn return_type(implementation: &ext::ScalarFunctionImplsItem) -> &str { + type_name(&implementation.return_.0) +} + +#[test] +fn maps_arrow_types_to_substrait_types() { + let cases: &[(&DataType, &str)] = &[ + (&DataType::Boolean, "boolean"), + (&DataType::Int32, "i32"), + (&DataType::UInt64, "u64"), + (&DataType::Float64, "fp64"), + (&DataType::Utf8View, "string"), + (&DataType::BinaryView, "binary"), + ( + &DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + "timestamp_tz", + ), + (&DataType::Duration(TimeUnit::Millisecond), "duration"), + ( + 
&DataType::Interval(IntervalUnit::MonthDayNano), + "interval_month_day_nano", + ), + (&DataType::Decimal128(10, 3), "decimal<10,3>"), + ( + &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + "list<i32>", + ), + ( + &DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Utf8, true), + ])), + "struct<i32,string>", + ), + ( + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + "string", + ), + ]; + + for (input, expected) in cases { + assert_eq!( + arrow_type_to_substrait(input).unwrap(), + *expected, + "failed for {input:?}" + ); + } +} + +#[test] +fn maps_map_type_to_substrait() { + let map_type = DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Int64, true), + ])), + false, + )), + false, + ); + assert_eq!( + arrow_type_to_substrait(&map_type).unwrap(), + "map<string,i64>" + ); +} + +#[test] +fn maps_volatility_and_type_classes() { + assert_eq!( + volatility_to_substrait(&Volatility::Immutable), + (true, false) + ); + assert_eq!(volatility_to_substrait(&Volatility::Stable), (true, true)); + assert_eq!( + volatility_to_substrait(&Volatility::Volatile), + (false, false) + ); + + let cases: &[(&TypeSignatureClass, &str)] = &[ + (&TypeSignatureClass::Any, "any"), + (&TypeSignatureClass::Timestamp, "timestamp"), + (&TypeSignatureClass::Integer, "integer"), + (&TypeSignatureClass::Numeric, "numeric"), + (&TypeSignatureClass::Native(logical_float64()), "fp64"), + ]; + for (input, expected) in cases { + assert_eq!(type_class_to_substrait(input).unwrap(), *expected); + } +} + +#[test] +fn maps_signature_shapes_to_impls() { + let impls = scalar_impls(Signature::nullary(Volatility::Immutable), |_| { + Ok("i32".to_string()) + }); + assert!(impls[0].args.is_none()); + + let impls = scalar_impls( + Signature::exact(vec![DataType::Int32, DataType::Utf8], Volatility::Immutable), + |_| 
Ok("boolean".to_string()), + ); + assert_eq!(value_arg(&args(&impls[0])[0]).0.as_deref(), Some("x")); + assert_eq!(value_arg(&args(&impls[0])[0]).1, "i32"); + assert_eq!(value_arg(&args(&impls[0])[1]).1, "string"); + + let impls = scalar_impls(Signature::variadic_any(Volatility::Immutable), |_| { + Ok("any".to_string()) + }); + assert_eq!(value_arg(&args(&impls[0])[0]).1, "any"); + assert_eq!(impls[0].variadic.as_ref().unwrap().min, Some(1.0)); + + let impls = scalar_impls(Signature::any(2, Volatility::Immutable), |_| { + Ok("any".to_string()) + }); + assert_eq!(args(&impls[0]).len(), 2); + assert!(args(&impls[0]).iter().all(|a| value_arg(a).1 == "any")); + + let impls = scalar_impls(Signature::comparable(2, Volatility::Immutable), |_| { + Ok("any1".to_string()) + }); + assert!(args(&impls[0]).iter().all(|a| value_arg(a).1 == "any1")); + assert_eq!(return_type(&impls[0]), "any1"); + + let impls = scalar_impls(Signature::numeric(1, Volatility::Immutable), |args| { + arrow_type_to_substrait(&args[0]) + }); + assert_eq!(impls.len(), 10); + assert!(impls.iter().any(|i| value_arg(&args(i)[0]).1 == "i32")); + assert!(impls.iter().any(|i| value_arg(&args(i)[0]).1 == "fp64")); + + let impls = scalar_impls(Signature::string(1, Volatility::Immutable), |_| { + Ok("string".to_string()) + }); + assert_eq!(impls.len(), 1); + assert_eq!(value_arg(&args(&impls[0])[0]).1, "string"); + + let impls = scalar_impls( + Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int32]), + TypeSignature::Exact(vec![DataType::Int64]), + ], + Volatility::Immutable, + ), + |args| arrow_type_to_substrait(&args[0]), + ); + assert_eq!(impls.len(), 2); + assert_eq!(value_arg(&args(&impls[0])[0]).1, "i32"); + assert_eq!(value_arg(&args(&impls[1])[0]).1, "i64"); +} + +#[test] +fn uses_configured_signature_and_return_type_overrides() { + let mut overrides = FunctionYamlOverrides::default(); + overrides.user_defined_signatures.insert( + "custom_udf".to_string(), + 
vec![FunctionSignatureOverride::variadic( + vec!["any1".to_string()], + "list", + 2, + FunctionVariadicConsistency::Consistent, + )], + ); + + let impls = signature_to_impls( + "custom_udf", + &Signature::user_defined(Volatility::Immutable), + &["value".to_string()], + &overrides, + |_| Ok("unused".to_string()), + ) + .unwrap(); + + assert_eq!(value_arg(&args(&impls[0])[0]).0.as_deref(), Some("value")); + assert_eq!(return_type(&impls[0]), "list"); + assert_eq!(impls[0].variadic.as_ref().unwrap().min, Some(2.0)); + assert_eq!( + impls[0].variadic.as_ref().unwrap().parameter_consistency, + Some(ext::VariadicBehaviorParameterConsistency::Consistent) + ); + + overrides + .return_types + .insert("custom_return".to_string(), "string".to_string()); + let return_type = + return_type_or_any("custom_return", &overrides, &[DataType::Int32], || { + Err(DataFusionError::Internal("dynamic return type".to_string())) + }) + .unwrap(); + assert_eq!(return_type, "string"); +} + +#[test] +fn emits_alias_metadata_and_detects_duplicate_impls() { + let metadata = metadata_from_aliases(&["bar".to_string()]); + assert_eq!(metadata["datafusion"]["aliases"][0], "bar"); + + let implementation = function_impl( + vec![arg_yaml(0, &[], "i32".to_string())], + None, + "i32".to_string(), + ); + let err = + detect_duplicate_impls("duplicate", &[implementation.clone(), implementation]) + .unwrap_err(); + assert!( + err.to_string() + .contains("duplicate Substrait implementation") + ); +} + +#[test] +fn maps_aggregate_metadata() { + let implementation = + aggregate_impl("sum", true, function_impl(vec![], None, "fp64".to_string())); + assert!(!implementation.ordered.unwrap().0); + assert_eq!(implementation.decomposable, Some(ext::Decomposable::Many)); + assert_eq!(type_name(&implementation.intermediate.unwrap().0), "fp64"); + assert_eq!(aggregate_decomposable("median"), ext::Decomposable::None); + assert_eq!(aggregate_decomposable("sum"), ext::Decomposable::Many); + 
assert_eq!(aggregate_intermediate_type("count", "i64"), "i64"); + assert_eq!(aggregate_intermediate_type("sum", "fp64"), "fp64"); +} + +#[test] +fn generated_extension_uses_configured_urn() { + let extension = generate_function_extension_for_inventory( + &FunctionYamlInventory { + scalar_functions: vec![], + higher_order_functions: vec![], + aggregate_functions: vec![], + window_functions: vec![], + }, + &FunctionYamlConfig { + urn: "extension:test".to_string(), + overrides: FunctionYamlOverrides::default(), + }, + ) + .unwrap(); + + assert_eq!(extension.urn, "extension:test"); + assert!(extension.scalar_functions.is_empty()); + assert!(extension.aggregate_functions.is_empty()); + assert!(extension.window_functions.is_empty()); +} diff --git a/datafusion/substrait/src/function_yaml/types.rs b/datafusion/substrait/src/function_yaml/types.rs new file mode 100644 index 0000000000000..c1707eb3a8dfc --- /dev/null +++ b/datafusion/substrait/src/function_yaml/types.rs @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; +use datafusion::common::types::LogicalType; +use datafusion::common::{Result, internal_err}; +use datafusion::logical_expr::{TypeSignatureClass, Volatility}; + +pub fn volatility_to_substrait(volatility: &Volatility) -> (bool, bool) { + match volatility { + Volatility::Immutable => (true, false), + Volatility::Stable => (true, true), + Volatility::Volatile => (false, false), + } +} + +pub fn type_class_to_substrait(class: &TypeSignatureClass) -> Result<String> { + match class { + TypeSignatureClass::Any => Ok("any".to_string()), + TypeSignatureClass::Timestamp => Ok("timestamp".to_string()), + TypeSignatureClass::Time => Ok("time".to_string()), + TypeSignatureClass::Interval => Ok("interval".to_string()), + TypeSignatureClass::Duration => Ok("duration".to_string()), + TypeSignatureClass::Integer => Ok("integer".to_string()), + TypeSignatureClass::Float => Ok("float".to_string()), + TypeSignatureClass::Decimal => Ok("decimal".to_string()), + TypeSignatureClass::Numeric => Ok("numeric".to_string()), + TypeSignatureClass::Binary => Ok("binary".to_string()), + TypeSignatureClass::Native(logical_type) => { + let data_type = logical_type.native().default_cast_for(&DataType::Null)?; + arrow_type_to_substrait(&data_type) + } + } +} + +pub fn arrow_type_to_substrait(data_type: &DataType) -> Result<String> { + let data_type = match data_type { + // Dict-encoded columns carry the same logical type as their value type. 
+ DataType::Dictionary(_, value_type) => value_type.as_ref(), + other => other, + }; + + match data_type { + DataType::Null => Ok("any".to_string()), + DataType::Boolean => Ok("boolean".to_string()), + DataType::Int8 => Ok("i8".to_string()), + DataType::Int16 => Ok("i16".to_string()), + DataType::Int32 => Ok("i32".to_string()), + DataType::Int64 => Ok("i64".to_string()), + DataType::UInt8 => Ok("u8".to_string()), + DataType::UInt16 => Ok("u16".to_string()), + DataType::UInt32 => Ok("u32".to_string()), + DataType::UInt64 => Ok("u64".to_string()), + DataType::Float16 => Ok("fp16".to_string()), + DataType::Float32 => Ok("fp32".to_string()), + DataType::Float64 => Ok("fp64".to_string()), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => { + Ok("string".to_string()) + } + DataType::Binary + | DataType::LargeBinary + | DataType::BinaryView + | DataType::FixedSizeBinary(_) => Ok("binary".to_string()), + DataType::Date32 | DataType::Date64 => Ok("date".to_string()), + DataType::Time32(unit) | DataType::Time64(unit) => { + Ok(format!("time<{}>", time_unit_name(unit))) + } + DataType::Timestamp(unit, timezone) => { + let name = if timezone.is_some() { + "timestamp_tz" + } else { + "timestamp" + }; + Ok(format!("{name}<{}>", time_unit_name(unit))) + } + DataType::Duration(unit) => Ok(format!("duration<{}>", time_unit_name(unit))), + DataType::Interval(unit) => Ok(match unit { + IntervalUnit::YearMonth => "interval_year_month", + IntervalUnit::DayTime => "interval_day_time", + IntervalUnit::MonthDayNano => "interval_month_day_nano", + } + .to_string()), + DataType::Decimal32(precision, scale) + | DataType::Decimal64(precision, scale) + | DataType::Decimal128(precision, scale) + | DataType::Decimal256(precision, scale) => { + Ok(format!("decimal<{precision},{scale}>")) + } + DataType::List(field) + | DataType::LargeList(field) + | DataType::ListView(field) + | DataType::LargeListView(field) => Ok(format!( + "list<{}>", + arrow_type_to_substrait(field.data_type())? 
+ )), + DataType::FixedSizeList(field, size) => Ok(format!( + "fixed_size_list<{},{}>", + arrow_type_to_substrait(field.data_type())?, + size + )), + DataType::Struct(fields) => { + let field_types = fields + .iter() + .map(|field| arrow_type_to_substrait(field.data_type())) + .collect::<Result<Vec<_>>>()?; + Ok(format!("struct<{}>", field_types.join(","))) + } + DataType::Map(field, _) => { + // Arrow maps encode key/value as Struct in the inner field. + if let DataType::Struct(fields) = field.data_type() + && fields.len() == 2 + { + let key = arrow_type_to_substrait(fields[0].data_type())?; + let val = arrow_type_to_substrait(fields[1].data_type())?; + return Ok(format!("map<{key},{val}>")); + } + internal_err!("unexpected Map inner type: {:?}", field.data_type()) + } + other => { + internal_err!("unsupported Arrow DataType for Substrait YAML: {other:?}") + } + } +} + +fn time_unit_name(unit: &TimeUnit) -> &'static str { + match unit { + TimeUnit::Second => "s", + TimeUnit::Millisecond => "ms", + TimeUnit::Microsecond => "us", + TimeUnit::Nanosecond => "ns", + } +} diff --git a/datafusion/substrait/src/lib.rs b/datafusion/substrait/src/lib.rs index 0819fd3a592f9..207d9f19b3cbf 100644 --- a/datafusion/substrait/src/lib.rs +++ b/datafusion/substrait/src/lib.rs @@ -89,6 +89,7 @@ //! # } //! ``` pub mod extensions; +pub mod function_yaml; pub mod logical_plan; #[cfg(feature = "physical")] pub mod physical_plan;