Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-16516: [R] Implement ym() my() and yq() parsers #13163

Closed
wants to merge 11 commits into from
24 changes: 23 additions & 1 deletion r/R/dplyr-datetime-helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,27 @@ build_formats <- function(orders) {
orders <- gsub("[^A-Za-z_]", "", orders)
orders <- gsub("Y", "y", orders)

# we need a different logic in order to deal with "ym', "my", and "yq" orders
# we separate them from the rest of the `orders` vector and transform them.
# `ym` and `yq` become `ymd` & `my` becomes `myd`
# this is needed because strptime does not parse "2022-05", so we add "-01",
# thus changing the format, and for equivalence with lubridate, which parses
# `ym` to the first day of the month
dragosmg marked this conversation as resolved.
Show resolved Hide resolved
short_orders <- c("ym", "my")

if (any(orders %in% short_orders)) {
thisisnic marked this conversation as resolved.
Show resolved Hide resolved
orders1 <- setdiff(orders, short_orders)
orders2 <- intersect(orders, short_orders)
orders2 <- paste0(orders2, "d")
orders <- unique(c(orders1, orders2))
}

if (any(orders == "yq")) {
orders1 <- setdiff(orders, "yq")
orders2 <- "ymd"
dragosmg marked this conversation as resolved.
Show resolved Hide resolved
orders <- unique(c(orders1, orders2))
}

supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
unsupported_passed_orders <- setdiff(orders, supported_orders)
supported_passed_orders <- intersect(orders, supported_orders)
Expand All @@ -176,7 +197,8 @@ build_formats <- function(orders) {
}

formats_list <- map(orders, build_format_from_order)
purrr::flatten_chr(formats_list)
formats <- purrr::flatten_chr(formats_list)
dragosmg marked this conversation as resolved.
Show resolved Hide resolved
unique(formats)
}

build_format_from_order <- function(order) {
Expand Down
62 changes: 61 additions & 1 deletion r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -493,11 +493,36 @@ register_bindings_datetime_parsers <- function() {
# each order is translated into possible formats
formats <- build_formats(orders)

x <- x$cast(string())

# make all separators (non-letters and non-numbers) into "-"
x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x)
# collapse multiple separators into a single one
x <- call_binding("gsub", "-{2,}", "-", x)

# we need to transform `x` when orders are `ym`, `my`, and `yq`
# for `ym` and `my` orders we add a day ("01")
augmented_x <- NULL
if (any(orders %in% c("ym", "my"))) {
augmented_x <- call_binding("paste0", x, "-01")
}

# for `yq` we need to transform the quarter into the start month (lubridate
# behaviour) and then add 01 to parse to the first day of the quarter
augmented_x2 <- NULL
if (any(orders == "yq")) {
# extract everything that comes after the `-` separator, i.e. the quarter
# (e.g. 4 from 2022-4)
quarter_x <- call_binding("gsub", "^.*?-", "", x)
dragosmg marked this conversation as resolved.
Show resolved Hide resolved
# we should probably error if quarter is not in 1:4
# extract everything that comes before the `-`, i.e. the year (e.g. 2002
# in 2002-4)
year_x <- call_binding("gsub", "-.*$", "", x)
quarter_x <- quarter_x$cast(int32())
month_x <- (quarter_x - 1) * 3 + 1
augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01")
}

# TODO figure out how to parse strings that have no separators
# https://issues.apache.org/jira/browse/ARROW-16446
# we could insert separators at the "likely" positions, but it might be
Expand All @@ -514,6 +539,41 @@ register_bindings_datetime_parsers <- function() {
)
}

# build separate expression lists of parsing attempts for the orders that
# need an augmented `x`
# list for attempts when orders %in% c("ym", "my")
parse_attempt_exp_augmented_x <- list()

if (!is.null(augmented_x)) {
for (i in seq_along(formats)) {
dragosmg marked this conversation as resolved.
Show resolved Hide resolved
parse_attempt_expressions[[i]] <- build_expr(
"strptime",
augmented_x,
options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE)
)
}
}

# list for attempts when orders %in% c("yq")
parse_attempt_exp_augmented_x2 <- list()

if (!is.null(augmented_x2)) {
for (i in seq_along(formats)) {
dragosmg marked this conversation as resolved.
Show resolved Hide resolved
parse_attempt_expressions[[i]] <- build_expr(
"strptime",
augmented_x2,
options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE)
)
}
}

# combine all attempts expressions in prep for coalesce
parse_attempt_expressions <- c(
dragosmg marked this conversation as resolved.
Show resolved Hide resolved
parse_attempt_expressions,
parse_attempt_exp_augmented_x,
parse_attempt_exp_augmented_x2
)

coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)

# we need this binding to be able to handle a NULL `tz`, which will then be
Expand All @@ -527,7 +587,7 @@ register_bindings_datetime_parsers <- function() {

})

ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my", "yq")

ymd_parser_map_factory <- function(order) {
force(order)
Expand Down
34 changes: 33 additions & 1 deletion r/tests/testthat/test-dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -1735,7 +1735,7 @@ test_that("parse_date_time() doesn't work with hour, minutes, and second compone
)
})

test_that("year, month, day date/time parsers work", {
test_that("year, month, day date/time parsers", {
test_df <- tibble::tibble(
ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"),
ydm_string = c("2022-11-05", "2022/12/05", "22.13-05"),
Expand Down Expand Up @@ -1776,3 +1776,35 @@ test_that("year, month, day date/time parsers work", {
test_df
)
})

test_that("ym, my & yq parsers", {
test_df <- tibble::tibble(
ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA),
my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA),
yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA),
yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA),
)
dragosmg marked this conversation as resolved.
Show resolved Hide resolved

# these functions' internals use some string processing which requires the
# RE2 library (not available on Windows with R 3.6)
skip_if_not_available("re2")
compare_dplyr_binding(
.input %>%
mutate(
ym_date = ym(ym_string),
ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"),
my_date = my(my_string),
my_datetime = my(my_string, tz = "Pacific/Marquesas"),
yq_date_from_string = yq(yq_string),
yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"),
yq_date_from_numeric = yq(yq_numeric),
yq_datetime_from_numeric = yq(yq_numeric, tz = "Pacific/Marquesas"),
ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")),
my_date2 = parse_date_time(my_string, orders = c("my", "myd")),
yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"),
yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq")
) %>%
collect(),
test_df
)
})