[GLUTEN-4306][CH]Fix CI Failure of Cast Function (#4337)

* improve multi if * improve cast * remove unused code * improve cast * improve cast * Update SparkFunctionCastFloatToInt.cpp re-run ci
apache · Jan 11, 2024 · a3c690a · a3c690a
1 parent 40f72a5
commit a3c690a
Show file tree

Hide file tree

Showing 6 changed files with 277 additions and 22 deletions.
diff --git a/...ickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCHParquetSuite.scala b/...ickhouse/src/test/scala/io/glutenproject/execution/GlutenClickHouseTPCHParquetSuite.scala
@@ -2056,6 +2056,16 @@ class GlutenClickHouseTPCHParquetSuite extends GlutenClickHouseTPCHAbstractSuite
     compareResultsAgainstVanillaSpark(sql, true, { _ => })
   }
 
+  test("GLUTEN-3149: Fix convert exception of Inf to int") {
+    val tbl_create_sql = "create table test_tbl_3149(a int, b int) using parquet";
+    val tbl_insert_sql = "insert into test_tbl_3149 values(1, 0)"
+    val select_sql = "select cast(a * 1.0f/b as int) as x from test_tbl_3149 where a = 1"
+    spark.sql(tbl_create_sql)
+    spark.sql(tbl_insert_sql);
+    compareResultsAgainstVanillaSpark(select_sql, true, { _ => })
+    spark.sql("drop table test_tbl_3149")
+  }
+
   test("test in-filter contains null value (bigint)") {
     val sql = "select s_nationkey from supplier where s_nationkey in (null, 1, 2)"
     compareResultsAgainstVanillaSpark(sql, true, { _ => })

diff --git a/cpp-ch/local-engine/Functions/SparkFunctionCastFloatToInt.cpp b/cpp-ch/local-engine/Functions/SparkFunctionCastFloatToInt.cpp
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <base/types.h>
+#include <Functions/SparkFunctionCastFloatToInt.h>
+
+using namespace DB;
+
+namespace local_engine
+{
+
+struct NameToUInt8 { static constexpr auto name = "sparkCastFloatToUInt8"; };
+struct NameToUInt16 { static constexpr auto name = "sparkCastFloatToUInt16"; };
+struct NameToUInt32 { static constexpr auto name = "sparkCastFloatToUInt32"; };
+struct NameToUInt64 { static constexpr auto name = "sparkCastFloatToUInt64"; };
+struct NameToUInt128 { static constexpr auto name = "sparkCastFloatToUInt128"; };
+struct NameToUInt256 { static constexpr auto name = "sparkCastFloatToUInt256"; };
+struct NameToInt8 { static constexpr auto name = "sparkCastFloatToInt8"; };
+struct NameToInt16 { static constexpr auto name = "sparkCastFloatToInt16"; };
+struct NameToInt32 { static constexpr auto name = "sparkCastFloatToInt32"; };
+struct NameToInt64 { static constexpr auto name = "sparkCastFloatToInt64"; };
+struct NameToInt128 { static constexpr auto name = "sparkCastFloatToInt128"; };
+struct NameToInt256 { static constexpr auto name = "sparkCastFloatToInt256"; };
+
+using SparkFunctionCastFloatToInt8 = local_engine::SparkFunctionCastFloatToInt<Int8, NameToInt8>;
+using SparkFunctionCastFloatToInt16 = local_engine::SparkFunctionCastFloatToInt<Int16, NameToInt16>;
+using SparkFunctionCastFloatToInt32 = local_engine::SparkFunctionCastFloatToInt<Int32, NameToInt32>;
+using SparkFunctionCastFloatToInt64 = local_engine::SparkFunctionCastFloatToInt<Int64, NameToInt64>;
+using SparkFunctionCastFloatToInt128 = local_engine::SparkFunctionCastFloatToInt<Int128, NameToInt128>;
+using SparkFunctionCastFloatToInt256 = local_engine::SparkFunctionCastFloatToInt<Int256, NameToInt256>;
+using SparkFunctionCastFloatToUInt8 = local_engine::SparkFunctionCastFloatToInt<UInt8, NameToUInt8>;
+using SparkFunctionCastFloatToUInt16 = local_engine::SparkFunctionCastFloatToInt<UInt16, NameToUInt16>;
+using SparkFunctionCastFloatToUInt32 = local_engine::SparkFunctionCastFloatToInt<UInt32, NameToUInt32>;
+using SparkFunctionCastFloatToUInt64 = local_engine::SparkFunctionCastFloatToInt<UInt64, NameToUInt64>;
+using SparkFunctionCastFloatToUInt128 = local_engine::SparkFunctionCastFloatToInt<UInt128, NameToUInt128>;
+using SparkFunctionCastFloatToUInt256 = local_engine::SparkFunctionCastFloatToInt<UInt256, NameToUInt256>;
+
+REGISTER_FUNCTION(SparkFunctionCastToInt)
+{
+    factory.registerFunction<SparkFunctionCastFloatToInt8>();
+    factory.registerFunction<SparkFunctionCastFloatToInt16>();
+    factory.registerFunction<SparkFunctionCastFloatToInt32>();
+    factory.registerFunction<SparkFunctionCastFloatToInt64>();
+    factory.registerFunction<SparkFunctionCastFloatToInt128>();
+    factory.registerFunction<SparkFunctionCastFloatToInt256>();
+    factory.registerFunction<SparkFunctionCastFloatToUInt8>();
+    factory.registerFunction<SparkFunctionCastFloatToUInt16>();
+    factory.registerFunction<SparkFunctionCastFloatToUInt32>();
+    factory.registerFunction<SparkFunctionCastFloatToUInt64>();
+    factory.registerFunction<SparkFunctionCastFloatToUInt128>();
+    factory.registerFunction<SparkFunctionCastFloatToUInt256>();
+}
+}
diff --git a/cpp-ch/local-engine/Functions/SparkFunctionCastFloatToInt.h b/cpp-ch/local-engine/Functions/SparkFunctionCastFloatToInt.h
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <Common/NaNUtils.h>
+#include <DataTypes/IDataType.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <Functions/IFunction.h>
+#include <Functions/FunctionFactory.h>
+#include <Columns/ColumnsNumber.h>
+#include <Columns/ColumnVector.h>
+#include <Columns/ColumnNullable.h>
+
+using namespace DB;
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int TYPE_MISMATCH;
+}
+}
+
+namespace local_engine
+{
+
+template <typename T, typename Name>
+class SparkFunctionCastFloatToInt : public DB::IFunction
+{
+public:
+    size_t getNumberOfArguments() const override { return 1; }
+    static constexpr auto name = Name::name;
+    static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared<SparkFunctionCastFloatToInt>(); }
+    SparkFunctionCastFloatToInt() = default;
+    ~SparkFunctionCastFloatToInt() override = default;
+    DB::String getName() const override { return name; }
+    bool useDefaultImplementationForConstants() const override { return true; }
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
+
+    DB::DataTypePtr getReturnTypeImpl(const DB::DataTypes &) const override
+    {
+        if constexpr (std::is_integral_v<T>)
+        {
+            return DB::makeNullable(std::make_shared<const DB::DataTypeNumber<T>>());
+        }
+        else
+            throw DB::Exception(DB::ErrorCodes::TYPE_MISMATCH, "Function {}'s return type should be Int", name);
+    }
+
+    DB::ColumnPtr executeImpl(const DB::ColumnsWithTypeAndName & arguments, const DB::DataTypePtr & result_type, size_t) const override
+    {
+        if (arguments.size() != 1)
+            throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {}'s arguments number must be 1", name);
+
+        if (!isFloat(removeNullable(arguments[0].type)))
+            throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s 1st argument must be float type", name);
+
+        DB::ColumnPtr src_col = arguments[0].column;
+        size_t size = src_col->size();
+
+        auto res_col = DB::ColumnVector<T>::create(size);
+        auto null_map_col = DB::ColumnUInt8::create(size, 0);
+
+        switch(removeNullable(arguments[0].type)->getTypeId())
+        {
+            case DB::TypeIndex::Float32:
+            {
+                executeInternal<DB::Float32>(src_col, res_col->getData(), null_map_col->getData());
+                break;
+            }
+            case DB::TypeIndex::Float64:
+            {
+                executeInternal<DB::Float64>(src_col, res_col->getData(), null_map_col->getData());
+                break;
+            }
+        }
+        return DB::ColumnNullable::create(std::move(res_col), std::move(null_map_col));
+    }
+
+    template <typename F>
+    void executeInternal(const DB::ColumnPtr & src, DB::PaddedPODArray<T> & data, DB::PaddedPODArray<UInt8> & null_map_data) const
+    {
+        const DB::ColumnVector<F> * src_vec = assert_cast<const DB::ColumnVector<F> *>(src.get());
+        for (size_t i = 0; i < src_vec->size(); ++i)
+        {
+            F element = src_vec->getElement(i);
+            if (isNaN(element) || !isFinite(element))
+            {
+                data[i] = 0;
+                null_map_data[i] = 1;
+            }
+            else
+                data[i] = static_cast<T>(element);
+        }
+    }
+
+};
+
+}
diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp
@@ -1017,7 +1017,6 @@ const ActionsDAG::Node * SerializedPlanParser::parseFunctionWithDAG(
         result_name = ch_func_name + "(" + args_name + ")";
         const auto * function_node = &actions_dag->addFunction(function_builder, args, result_name);
         result_node = function_node;
-
         if (!TypeParser::isTypeMatched(rel.scalar_function().output_type(), function_node->result_type) && !converted_decimal_args)
         {
             auto result_type = TypeParser::parseType(rel.scalar_function().output_type());
@@ -1643,7 +1642,6 @@ const ActionsDAG::Node * SerializedPlanParser::parseExpression(ActionsDAGPtr act
             args.emplace_back(parseExpression(actions_dag, input));
 
             const auto & substrait_type = rel.cast().type();
-            auto to_ch_type = TypeParser::parseType(substrait_type);
             const ActionsDAG::Node * function_node = nullptr;
             if (DB::isString(DB::removeNullable(args.back()->result_type)) && substrait_type.has_date())
             {
@@ -1662,23 +1660,6 @@ const ActionsDAG::Node * SerializedPlanParser::parseExpression(ActionsDAGPtr act
                 // Spark cast(x as BINARY) -> CH reinterpretAsStringSpark(x)
                 function_node = toFunctionNode(actions_dag, "reinterpretAsStringSpark", args);
             }
-            else if (DB::isFloat(DB::removeNullable(args[0]->result_type)) && DB::isNativeInteger(DB::removeNullable(to_ch_type)))
-            {
-                /// It looks like by design in CH that forbids cast NaN/Inf to integer.
-                auto zero_node = addColumn(actions_dag, args[0]->result_type, 0.0);
-                const auto * if_not_finite_node = toFunctionNode(actions_dag, "ifNotFinite", {args[0], zero_node});
-                const auto * final_arg_node = if_not_finite_node;
-                if (args[0]->result_type->isNullable())
-                {
-                    DB::Field null_field;
-                    const auto * null_value = addColumn(actions_dag, args[0]->result_type, null_field);
-                    const auto * is_null_node = toFunctionNode(actions_dag, "isNull", {args[0]});
-                    const auto * if_node = toFunctionNode(actions_dag, "if", {is_null_node, null_value, if_not_finite_node});
-                    final_arg_node = if_node;
-                }
-                function_node = toFunctionNode(
-                    actions_dag, "CAST", {final_arg_node, addColumn(actions_dag, std::make_shared<DataTypeString>(), to_ch_type->getName())});
-            }
             else
             {
                 DataTypePtr ch_type = TypeParser::parseType(substrait_type);
@@ -1690,8 +1671,16 @@ const ActionsDAG::Node * SerializedPlanParser::parseExpression(ActionsDAGPtr act
                 }
                 else
                 {
-                    args.emplace_back(addColumn(actions_dag, std::make_shared<DataTypeString>(), ch_type->getName()));
-                    function_node = toFunctionNode(actions_dag, "CAST", args);
+                    if (isFloat(DB::removeNullable(args[0]->result_type)) && isInt(DB::removeNullable(ch_type)))
+                    {
+                        String function_name = "sparkCastFloatTo" + DB::removeNullable(ch_type)->getName();
+                        function_node = toFunctionNode(actions_dag, function_name, args);
+                    }
+                    else
+                    {
+                        args.emplace_back(addColumn(actions_dag, std::make_shared<DataTypeString>(), ch_type->getName()));
+                        function_node = toFunctionNode(actions_dag, "CAST", args);
+                    }
                 }
             }
 

diff --git a/cpp-ch/local-engine/tests/CMakeLists.txt b/cpp-ch/local-engine/tests/CMakeLists.txt
@@ -71,6 +71,7 @@ else()
         benchmark_parquet_read.cpp
         benchmark_spark_row.cpp
         benchmark_unix_timestamp_function.cpp
-        benchmark_spark_floor_function.cpp)
+        benchmark_spark_floor_function.cpp
+        benchmark_cast_float_function.cpp)
     target_link_libraries(benchmark_local_engine PRIVATE gluten_clickhouse_backend_libs ch_contrib::gbenchmark_all loggers ch_parquet)
 endif()
diff --git a/cpp-ch/local-engine/tests/benchmark_cast_float_function.cpp b/cpp-ch/local-engine/tests/benchmark_cast_float_function.cpp
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <Core/Block.h>
+#include <Columns/IColumn.h>
+#include <DataTypes/IDataType.h>
+#include <DataTypes/DataTypeFactory.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionsConversion.h>
+#include <Functions/SparkFunctionCastFloatToInt.h>
+#include <Parser/SerializedPlanParser.h>
+#include <benchmark/benchmark.h>
+
+using namespace DB;
+
+static Block createDataBlock(size_t rows)
+{
+    auto type = DataTypeFactory::instance().get("Float64");
+    auto column = type->createColumn();
+    for (size_t i = 0; i < rows; ++i)
+    {
+        column->insert(i * 1.0f);
+    }
+    Block block;
+    block.insert(ColumnWithTypeAndName(std::move(column), type, "d"));
+    return std::move(block);
+}
+
+static void BM_CHCastFloatToInt(benchmark::State & state)
+{
+    using namespace DB;
+    auto & factory = FunctionFactory::instance();
+    auto function = factory.get("CAST", local_engine::SerializedPlanParser::global_context);
+    Block block = createDataBlock(30000000);
+    DB::ColumnsWithTypeAndName args;
+    args.emplace_back(block.getColumnsWithTypeAndName()[0]);
+    DB::ColumnWithTypeAndName type_name_col;
+    type_name_col.name = "Int64";
+    type_name_col.column = DB::DataTypeString().createColumnConst(0, type_name_col.name);
+    type_name_col.type = std::make_shared<DB::DataTypeString>();
+    args.emplace_back(type_name_col);
+    auto executable = function->build(args);
+    for (auto _ : state)[[maybe_unused]]
+        auto result = executable->execute(block.getColumnsWithTypeAndName(), executable->getResultType(), block.rows());
+}
+
+static void BM_SparkCastFloatToInt(benchmark::State & state)
+{
+    using namespace DB;
+    auto & factory = FunctionFactory::instance();
+    auto function = factory.get("sparkCastFloatToInt64", local_engine::SerializedPlanParser::global_context);
+    Block block = createDataBlock(30000000);
+    auto executable = function->build(block.getColumnsWithTypeAndName());
+    for (auto _ : state)[[maybe_unused]]
+         auto result = executable->execute(block.getColumnsWithTypeAndName(), executable->getResultType(), block.rows());
+}
+
+BENCHMARK(BM_CHCastFloatToInt)->Unit(benchmark::kMillisecond)->Iterations(10);
+BENCHMARK(BM_SparkCastFloatToInt)->Unit(benchmark::kMillisecond)->Iterations(10);