Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-13033: [C++] Kernel to localize naive timestamps to a timezone (preserving clock-time) #10610

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,44 @@ struct EnumTraits<compute::CompareOperator>
return "<INVALID>";
}
};
// EnumTraits specialization for AssumeTimezoneOptions::Ambiguous. The
// BasicEnumTraits base is parameterized with the enum type followed by its
// three enumerators (RAISE, EARLIEST, LATEST).
template <>
struct EnumTraits<compute::AssumeTimezoneOptions::Ambiguous>
    : BasicEnumTraits<compute::AssumeTimezoneOptions::Ambiguous,
                      compute::AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_RAISE,
                      compute::AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_EARLIEST,
                      compute::AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_LATEST> {
  // Returns the name of the enum type as a string.
  static std::string name() { return "AssumeTimezoneOptions::Ambiguous"; }

  // Returns the identifier of `value` spelled as a string. Any value that is
  // not one of the three known enumerators yields "<INVALID>".
  static std::string value_name(compute::AssumeTimezoneOptions::Ambiguous value) {
    using Ambiguous = compute::AssumeTimezoneOptions::Ambiguous;
    const char* label = "<INVALID>";
    switch (value) {
      case Ambiguous::AMBIGUOUS_RAISE:
        label = "AMBIGUOUS_RAISE";
        break;
      case Ambiguous::AMBIGUOUS_EARLIEST:
        label = "AMBIGUOUS_EARLIEST";
        break;
      case Ambiguous::AMBIGUOUS_LATEST:
        label = "AMBIGUOUS_LATEST";
        break;
    }
    return label;
  }
};
// EnumTraits specialization for AssumeTimezoneOptions::Nonexistent. The
// BasicEnumTraits base is parameterized with the enum type followed by its
// three enumerators (RAISE, EARLIEST, LATEST).
template <>
struct EnumTraits<compute::AssumeTimezoneOptions::Nonexistent>
    : BasicEnumTraits<compute::AssumeTimezoneOptions::Nonexistent,
                      compute::AssumeTimezoneOptions::Nonexistent::NONEXISTENT_RAISE,
                      compute::AssumeTimezoneOptions::Nonexistent::NONEXISTENT_EARLIEST,
                      compute::AssumeTimezoneOptions::Nonexistent::NONEXISTENT_LATEST> {
  // Returns the name of the enum type as a string.
  static std::string name() { return "AssumeTimezoneOptions::Nonexistent"; }

  // Returns the identifier of `value` spelled as a string. Any value that is
  // not one of the three known enumerators yields "<INVALID>".
  static std::string value_name(compute::AssumeTimezoneOptions::Nonexistent value) {
    using Nonexistent = compute::AssumeTimezoneOptions::Nonexistent;
    const char* label = "<INVALID>";
    switch (value) {
      case Nonexistent::NONEXISTENT_RAISE:
        label = "NONEXISTENT_RAISE";
        break;
      case Nonexistent::NONEXISTENT_EARLIEST:
        label = "NONEXISTENT_EARLIEST";
        break;
      case Nonexistent::NONEXISTENT_LATEST:
        label = "NONEXISTENT_LATEST";
        break;
    }
    return label;
  }
};
} // namespace internal

namespace compute {
Expand Down Expand Up @@ -147,6 +185,10 @@ static auto kStrptimeOptionsType = GetFunctionOptionsType<StrptimeOptions>(
DataMember("unit", &StrptimeOptions::unit));
// Options type descriptor for StrftimeOptions; exposes its `format` member
// under the name "format".
static auto kStrftimeOptionsType = GetFunctionOptionsType<StrftimeOptions>(
DataMember("format", &StrftimeOptions::format));
// Options type descriptor for AssumeTimezoneOptions; exposes the `timezone`,
// `ambiguous` and `nonexistent` members under those same names.
static auto kAssumeTimezoneOptionsType = GetFunctionOptionsType<AssumeTimezoneOptions>(
DataMember("timezone", &AssumeTimezoneOptions::timezone),
DataMember("ambiguous", &AssumeTimezoneOptions::ambiguous),
DataMember("nonexistent", &AssumeTimezoneOptions::nonexistent));
// Options type descriptor for PadOptions; exposes its `width` and `padding`
// members under those same names.
static auto kPadOptionsType = GetFunctionOptionsType<PadOptions>(
DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
static auto kTrimOptionsType = GetFunctionOptionsType<TrimOptions>(
Expand Down Expand Up @@ -250,6 +292,15 @@ StrftimeOptions::StrftimeOptions() : StrftimeOptions(kDefaultFormat) {}
// Namespace-scope definitions of StrftimeOptions' static constexpr members
// (before C++17 such members are not implicitly inline, so a definition is
// needed if they are odr-used).
constexpr char StrftimeOptions::kTypeName[];
constexpr const char* StrftimeOptions::kDefaultFormat;

// Builds the options from a timezone name plus the policies for ambiguous and
// nonexistent local times. The base FunctionOptions is initialized with
// internal::kAssumeTimezoneOptionsType, and the by-value `timezone` argument is
// moved into the member of the same name.
AssumeTimezoneOptions::AssumeTimezoneOptions(std::string timezone, Ambiguous ambiguous,
Nonexistent nonexistent)
: FunctionOptions(internal::kAssumeTimezoneOptionsType),
timezone(std::move(timezone)),
ambiguous(ambiguous),
nonexistent(nonexistent) {}
// Default constructor: delegates to the constructor above with timezone "UTC",
// leaving `ambiguous` and `nonexistent` at that constructor's declared defaults.
AssumeTimezoneOptions::AssumeTimezoneOptions() : AssumeTimezoneOptions("UTC") {}
// Namespace-scope definition of the static constexpr kTypeName member.
constexpr char AssumeTimezoneOptions::kTypeName[];

PadOptions::PadOptions(int64_t width, std::string padding)
: FunctionOptions(internal::kPadOptionsType),
width(width),
Expand Down Expand Up @@ -311,6 +362,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kStrftimeOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kAssumeTimezoneOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType));
DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
Expand Down Expand Up @@ -512,6 +564,11 @@ Result<Datum> DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext*
return CallFunction("day_of_week", {arg}, &options, ctx);
}

// Convenience wrapper: calls the function named "assume_timezone" on `arg` via
// CallFunction, passing a pointer to the by-value `options` copy and forwarding
// `ctx`. Returns whatever CallFunction returns.
Result<Datum> AssumeTimezone(const Datum& arg, AssumeTimezoneOptions options,
ExecContext* ctx) {
return CallFunction("assume_timezone", {arg}, &options, ctx);
}

// Convenience wrapper: calls the function named "strftime" on `arg` via
// CallFunction, passing a pointer to the by-value `options` copy and forwarding
// `ctx`. Returns whatever CallFunction returns.
Result<Datum> Strftime(const Datum& arg, StrftimeOptions options, ExecContext* ctx) {
return CallFunction("strftime", {arg}, &options, ctx);
}
Expand Down
51 changes: 51 additions & 0 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
#include "arrow/vendored/datetime.h"

namespace arrow {
namespace compute {
Expand Down Expand Up @@ -278,6 +279,40 @@ struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions {
uint32_t week_start;
};

/// \brief Options used to control timestamp timezone conversion and the handling
/// of ambiguous/nonexistent local times.
struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions {
 public:
  /// \brief How to interpret ambiguous local times that can be interpreted as
  /// multiple instants (normally two) due to DST shifts.
  ///
  /// AMBIGUOUS_RAISE is the constructor default. NOTE(review): by its name it is
  /// expected to report an error for such times; confirm against the kernel.
  /// AMBIGUOUS_EARLIEST emits the earliest instant amongst possible interpretations.
  /// AMBIGUOUS_LATEST emits the latest instant amongst possible interpretations.
  enum Ambiguous { AMBIGUOUS_RAISE, AMBIGUOUS_EARLIEST, AMBIGUOUS_LATEST };

  /// \brief How to handle local times that do not exist due to DST shifts.
  ///
  /// NONEXISTENT_RAISE is the constructor default. NOTE(review): by its name it
  /// is expected to report an error for such times; confirm against the kernel.
  /// NONEXISTENT_EARLIEST emits the instant "just before" the DST shift instant
  /// in the given timestamp precision (for example, for a nanoseconds precision
  /// timestamp, this is one nanosecond before the DST shift instant).
  /// NONEXISTENT_LATEST emits the DST shift instant.
  enum Nonexistent { NONEXISTENT_RAISE, NONEXISTENT_EARLIEST, NONEXISTENT_LATEST };

  /// \brief Construct options for a given timezone.
  ///
  /// \param[in] timezone timezone to convert timestamps from
  /// \param[in] ambiguous how to interpret ambiguous local times
  /// \param[in] nonexistent how to handle nonexistent local times
  explicit AssumeTimezoneOptions(std::string timezone,
                                 Ambiguous ambiguous = AMBIGUOUS_RAISE,
                                 Nonexistent nonexistent = NONEXISTENT_RAISE);
  /// \brief Default-construct the options; the timezone value is chosen by the
  /// out-of-line definition.
  AssumeTimezoneOptions();
  /// Type name string for this options class.
  constexpr static char const kTypeName[] = "AssumeTimezoneOptions";

  /// Timezone to convert timestamps from
  std::string timezone;

  /// How to interpret ambiguous local times (due to DST shifts)
  Ambiguous ambiguous;
  /// How to interpret non-existent local times (due to DST shifts)
  Nonexistent nonexistent;
};

/// @}

/// \brief Get the absolute value of a value.
Expand Down Expand Up @@ -1025,5 +1060,21 @@ ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NUL
ARROW_EXPORT Result<Datum> Strftime(const Datum& values, StrftimeOptions options,
ExecContext* ctx = NULLPTR);

/// \brief Convert local timestamps without a timezone to timestamps with a
/// timezone, interpreting each element of `values` as a local time in the
/// timezone specified by `options`
///
/// \param[in] values input to convert
/// \param[in] options the source timezone and the handling of ambiguous and
/// nonexistent local times
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 6.0.0
/// \note API not yet finalized
ARROW_EXPORT Result<Datum> AssumeTimezone(const Datum& values,
AssumeTimezoneOptions options,
ExecContext* ctx = NULLPTR);

} // namespace compute
} // namespace arrow
12 changes: 9 additions & 3 deletions cpp/src/arrow/compute/function_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,18 @@
// specific language governing permissions and limitations
// under the License.

#include "arrow/compute/function.h"

#include <gtest/gtest.h>

#include <memory>
#include <string>
#include <vector>

#include <gtest/gtest.h>

#include "arrow/compute/api_aggregate.h"
#include "arrow/compute/api_scalar.h"
#include "arrow/compute/api_vector.h"
#include "arrow/compute/cast.h"
#include "arrow/compute/function.h"
#include "arrow/compute/kernel.h"
#include "arrow/datum.h"
#include "arrow/status.h"
Expand Down Expand Up @@ -80,6 +81,11 @@ TEST(FunctionOptions, Equality) {
options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI));
options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO));
options.emplace_back(new StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C"));
#ifndef _WIN32
options.emplace_back(new AssumeTimezoneOptions(
"Europe/Amsterdam", AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_RAISE,
AssumeTimezoneOptions::Nonexistent::NONEXISTENT_RAISE));
#endif
options.emplace_back(new PadOptions(5, " "));
options.emplace_back(new PadOptions(10, "A"));
options.emplace_back(new TrimOptions(" "));
Expand Down
11 changes: 7 additions & 4 deletions cpp/src/arrow/compute/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,9 +290,11 @@ class ARROW_EXPORT OutputType {
enum ResolveKind { FIXED, COMPUTED };

/// Type resolution function. Given input types and shapes, return output
/// type and shape. This function SHOULD _not_ be used to check for arity,
/// that is to be performed one or more layers above. May make use of kernel
/// state to know what type to output in some cases.
/// type and shape. This function MAY use the kernel state to decide
/// the output type based on the function options.
///
/// This function SHOULD _not_ be used to check for arity, that is to be
/// performed one or more layers above.
using Resolver =
std::function<Result<ValueDescr>(KernelContext*, const std::vector<ValueDescr>&)>;

Expand All @@ -304,7 +306,8 @@ class ARROW_EXPORT OutputType {
/// \brief Output the exact type and shape provided by a ValueDescr
OutputType(ValueDescr descr); // NOLINT implicit construction

explicit OutputType(Resolver resolver)
/// \brief Output a computed type depending on actual input types
OutputType(Resolver resolver) // NOLINT implicit construction
: kind_(COMPUTED), resolver_(std::move(resolver)) {}

OutputType(const OutputType& other) {
Expand Down
Loading