Skip to content

Commit

Permalink
ARROW-16516: [R] Implement ym() my() and yq() parsers
Browse files Browse the repository at this point in the history
The `ym()`, `my()` and `yq()` bindings will make the following possible (and identical):

``` r
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)

test_df <- tibble::tibble(
  ym_string = c("2022-05", "2022/02", "22.03", NA)
)

test_df %>%
  mutate(ym_date = ym(ym_string))
#> # A tibble: 4 × 2
#>   ym_string ym_date
#>   <chr>     <date>
#> 1 2022-05   2022-05-01
#> 2 2022/02   2022-02-01
#> 3 22.03     2022-03-01
#> 4 <NA>      NA

test_df %>%
  arrow_table() %>%
  mutate(ym_date = ym(ym_string)) %>%
  collect()
#> # A tibble: 4 × 2
#>   ym_string ym_date
#>   <chr>     <date>
#> 1 2022-05   2022-05-01
#> 2 2022/02   2022-02-01
#> 3 22.03     2022-03-01
#> 4 <NA>      NA
```

<sup>Created on 2022-05-16 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup>

I've implementing this with the following steps:
* add `"-01"` to the end of the strings we're trying to parse, and then
* use one the supported `orders` (`"ymd"` or `"myd"`)

Closes #13163 from dragosmg/ym_my_yq_parsers

Authored-by: Dragoș Moldovan-Grünfeld <dragos.mold@gmail.com>
Signed-off-by: Nic Crane <thisisnic@gmail.com>
  • Loading branch information
dragosmg authored and thisisnic committed May 18, 2022
1 parent 0742f78 commit 60f6caf
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 8 deletions.
23 changes: 22 additions & 1 deletion r/R/dplyr-datetime-helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,26 @@ build_formats <- function(orders) {
orders <- gsub("[^A-Za-z_]", "", orders)
orders <- gsub("Y", "y", orders)

# we separate "ym', "my", and "yq" from the rest of the `orders` vector and
# transform them. `ym` and `yq` -> `ymd` & `my` -> `myd`
# this is needed for 2 reasons:
# 1. strptime does not parse "2022-05" -> we add "-01", thus changing the format,
# 2. for equivalence to lubridate, which parses `ym` to the first day of the month
short_orders <- c("ym", "my")

if (any(orders %in% short_orders)) {
orders1 <- setdiff(orders, short_orders)
orders2 <- intersect(orders, short_orders)
orders2 <- paste0(orders2, "d")
orders <- unique(c(orders1, orders2))
}

if (any(orders == "yq")) {
orders1 <- setdiff(orders, "yq")
orders2 <- "ymd"
orders <- unique(c(orders1, orders2))
}

supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
unsupported_passed_orders <- setdiff(orders, supported_orders)
supported_passed_orders <- intersect(orders, supported_orders)
Expand All @@ -176,7 +196,8 @@ build_formats <- function(orders) {
}

formats_list <- map(orders, build_format_from_order)
purrr::flatten_chr(formats_list)
formats <- purrr::flatten_chr(formats_list)
unique(formats)
}

build_format_from_order <- function(order) {
Expand Down
84 changes: 78 additions & 6 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -493,27 +493,99 @@ register_bindings_datetime_parsers <- function() {
# each order is translated into possible formats
formats <- build_formats(orders)

x <- x$cast(string())

# make all separators (non-letters and non-numbers) into "-"
x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x)
# collapse multiple separators into a single one
x <- call_binding("gsub", "-{2,}", "-", x)

# we need to transform `x` when orders are `ym`, `my`, and `yq`
# for `ym` and `my` orders we add a day ("01")
augmented_x <- NULL
if (any(orders %in% c("ym", "my"))) {
augmented_x <- call_binding("paste0", x, "-01")
}

# for `yq` we need to transform the quarter into the start month (lubridate
# behaviour) and then add 01 to parse to the first day of the quarter
augmented_x2 <- NULL
if (any(orders == "yq")) {
# extract everything that comes after the `-` separator, i.e. the quarter
# (e.g. 4 from 2022-4)
quarter_x <- call_binding("gsub", "^.*?-", "", x)
# we should probably error if quarter is not in 1:4
# extract everything that comes before the `-`, i.e. the year (e.g. 2002
# in 2002-4)
year_x <- call_binding("gsub", "-.*$", "", x)
quarter_x <- quarter_x$cast(int32())
month_x <- (quarter_x - 1) * 3 + 1
augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01")
}

# TODO figure out how to parse strings that have no separators
# https://issues.apache.org/jira/browse/ARROW-16446
# we could insert separators at the "likely" positions, but it might be
# tricky given the possible combinations between dmy formats + locale

# build a list of expressions for each format
parse_attempt_expressions <- list()

for (i in seq_along(formats)) {
parse_attempt_expressions[[i]] <- build_expr(
parse_attempt_expressions <- map(
formats,
~ build_expr(
"strptime",
x,
options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE)
options = list(
format = .x,
unit = 0L,
error_is_null = TRUE
)
)
)

# build separate expression lists of parsing attempts for the orders that
# need an augmented `x`
# list for attempts when orders %in% c("ym", "my")
parse_attempt_exp_augmented_x <- list()

if (!is.null(augmented_x)) {
parse_attempt_exp_augmented_x <- map(
formats,
~ build_expr(
"strptime",
augmented_x,
options = list(
format = .x,
unit = 0L,
error_is_null = TRUE
)
)
)
}

# list for attempts when orders %in% c("yq")
parse_attempt_exp_augmented_x2 <- list()
if (!is.null(augmented_x2)) {
parse_attempt_exp_augmented_x2 <- map(
formats,
~ build_expr(
"strptime",
augmented_x2,
options = list(
format = .x,
unit = 0L,
error_is_null = TRUE
)
)
)
}

# combine all attempts expressions in prep for coalesce
parse_attempt_expressions <- c(
parse_attempt_expressions,
parse_attempt_exp_augmented_x,
parse_attempt_exp_augmented_x2
)

coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)

# we need this binding to be able to handle a NULL `tz`, which will then be
Expand All @@ -527,7 +599,7 @@ register_bindings_datetime_parsers <- function() {

})

ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my", "yq")

ymd_parser_map_factory <- function(order) {
force(order)
Expand Down
38 changes: 37 additions & 1 deletion r/tests/testthat/test-dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -1735,7 +1735,7 @@ test_that("parse_date_time() doesn't work with hour, minutes, and second compone
)
})

test_that("year, month, day date/time parsers work", {
test_that("year, month, day date/time parsers", {
test_df <- tibble::tibble(
ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"),
ydm_string = c("2022-11-05", "2022/12/05", "22.13-05"),
Expand Down Expand Up @@ -1776,3 +1776,39 @@ test_that("year, month, day date/time parsers work", {
test_df
)
})

test_that("ym, my & yq parsers", {
test_df <- tibble::tibble(
ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA),
my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA),
yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA),
yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA),
yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA)
)

# these functions' internals use some string processing which requires the
# RE2 library (not available on Windows with R 3.6)
skip_if_not_available("re2")
compare_dplyr_binding(
.input %>%
mutate(
ym_date = ym(ym_string),
ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"),
my_date = my(my_string),
my_datetime = my(my_string, tz = "Pacific/Marquesas"),
yq_date_from_string = yq(yq_string),
yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"),
yq_date_from_numeric = yq(yq_numeric),
yq_datetime_from_numeric = yq(yq_numeric, tz = "Pacific/Marquesas"),
yq_date_from_string_with_space = yq(yq_space),
yq_datetime_from_string_with_space = yq(yq_space, tz = "Pacific/Marquesas"),
ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")),
my_date2 = parse_date_time(my_string, orders = c("my", "myd")),
yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"),
yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"),
yq_date_from_string_with_space2 = parse_date_time(yq_space, orders = "yq")
) %>%
collect(),
test_df
)
})

0 comments on commit 60f6caf

Please sign in to comment.