From 22df70a0caf888084ae13d7f3e0178a477d1e7d7 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 2 Oct 2023 22:51:50 +0800 Subject: [PATCH] GH-37799: [C++] Compute: CommonTemporal support time32 and time64 casting (#37949) ### Rationale for this change The original problem in mentioned in https://github.com/apache/arrow/issues/37799 1. `SECOND` in Parquet would always cast to `MILLS` 2. `eq` not support `eq(time32[s], time32[ms)` This patch is as advice in https://github.com/apache/arrow/issues/37799#issuecomment-1737747352 . We tent to add time32 and time64 in `CommonTemporal`. ### What changes are included in this PR? Support time32 and time64 with different time unit in `arrow::compute::internal::CommonTemporal`. ### Are these changes tested? Yes ### Are there any user-facing changes? bugfix * Closes: #37799 Authored-by: mwish Signed-off-by: Benjamin Kietzman --- .../arrow/compute/kernels/codegen_internal.cc | 49 +++++++++++++++---- .../compute/kernels/codegen_internal_test.cc | 12 +++++ cpp/src/arrow/dataset/file_parquet_test.cc | 25 ++++++++++ 3 files changed, 77 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 8e2669bd3dfb9..00a833742f957 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -251,6 +251,7 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) { bool saw_date32 = false; bool saw_date64 = false; bool saw_duration = false; + bool saw_time_since_midnight = false; const TypeHolder* end = begin + count; for (auto it = begin; it != end; it++) { auto id = it->type->id(); @@ -271,6 +272,18 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) { finest_unit = std::max(finest_unit, ty.unit()); continue; } + case Type::TIME32: { + const auto& type = checked_cast(*it->type); + finest_unit = std::max(finest_unit, type.unit()); + saw_time_since_midnight = true; + continue; + } + case Type::TIME64: { + const auto& type = checked_cast(*it->type); + finest_unit = std::max(finest_unit, type.unit()); + saw_time_since_midnight = true; + continue; + } case Type::DURATION: { const auto& ty = checked_cast(*it->type); finest_unit = std::max(finest_unit, ty.unit()); @@ -282,15 +295,33 @@ TypeHolder CommonTemporal(const TypeHolder* begin, size_t count) { } } - if (timezone) { - // At least one timestamp seen - return timestamp(finest_unit, *timezone); - } else if (saw_date64) { - return date64(); - } else if (saw_date32) { - return date32(); - } else if (saw_duration) { - return duration(finest_unit); + bool saw_timestamp_or_date = timezone || saw_date64 || saw_date32 || saw_duration; + + if (saw_time_since_midnight && saw_timestamp_or_date) { + // Cannot find common type + return TypeHolder(nullptr); + } + if (saw_timestamp_or_date) { + if (timezone) { + // At least one timestamp seen + return timestamp(finest_unit, *timezone); + } else if (saw_date64) { + return date64(); + } else if (saw_date32) { + return date32(); + } else if (saw_duration) { + return duration(finest_unit); + } + } + if (saw_time_since_midnight) { + switch (finest_unit) { + case TimeUnit::SECOND: + case TimeUnit::MILLI: + return time32(finest_unit); + case TimeUnit::MICRO: + case TimeUnit::NANO: + return time64(finest_unit); + } } return TypeHolder(nullptr); } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc index af024fb8d6e08..6bb5568d2ff38 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc @@ -159,6 +159,18 @@ TEST(TestDispatchBest, CommonTemporal) { args = {timestamp(TimeUnit::SECOND, "America/Phoenix"), timestamp(TimeUnit::SECOND, "UTC")}; ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr); + + args = {time32(TimeUnit::SECOND), time32(TimeUnit::MILLI)}; + AssertTypeEqual(*time32(TimeUnit::MILLI), *CommonTemporal(args.data(), args.size())); + + args = {time32(TimeUnit::SECOND), time64(TimeUnit::NANO)}; + AssertTypeEqual(*time64(TimeUnit::NANO), *CommonTemporal(args.data(), args.size())); + + args = {date32(), time32(TimeUnit::SECOND)}; + ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr); + + args = {timestamp(TimeUnit::SECOND), time32(TimeUnit::SECOND)}; + ASSERT_EQ(CommonTemporal(args.data(), args.size()), nullptr); } TEST(TestDispatchBest, CommonTemporalResolution) { diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 177ca824179a8..dc9e085df3c4c 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -730,6 +730,31 @@ TEST_P(TestParquetFileFormatScan, PredicatePushdownRowGroupFragmentsUsingDuratio CountRowGroupsInFragment(fragment, {0}, expr); } +TEST_P(TestParquetFileFormatScan, + PredicatePushdownRowGroupFragmentsUsingTimestampColumn) { + // GH-37799: Parquet arrow will change TimeUnit::SECOND to TimeUnit::MILLI + // because parquet LogicalType doesn't support SECOND. + for (auto time_unit : {TimeUnit::MILLI, TimeUnit::SECOND}) { + auto table = TableFromJSON(schema({field("t", time32(time_unit))}), + { + R"([{"t": 1}])", + R"([{"t": 2}, {"t": 3}])", + }); + TableBatchReader table_reader(*table); + ARROW_SCOPED_TRACE("time_unit=", time_unit); + ASSERT_OK_AND_ASSIGN( + auto source, + ParquetFormatHelper::Write( + &table_reader, ArrowWriterProperties::Builder().store_schema()->build()) + .As()); + SetSchema({field("t", time32(time_unit))}); + ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(source)); + + auto expr = equal(field_ref("t"), literal(::arrow::Time32Scalar(1, time_unit))); + CountRowGroupsInFragment(fragment, {0}, expr); + } +} + // Tests projection with nested/indexed FieldRefs. // https://github.com/apache/arrow/issues/35579 TEST_P(TestParquetFileFormatScan, ProjectWithNonNamedFieldRefs) {