Skip to content

Commit

Permalink
ARROW-16653: [R] All formats are supported with the lubridate `parse_…
Browse files Browse the repository at this point in the history
…date_time` binding (#13506)

This is to resolve [ARROW-16653](https://issues.apache.org/jira/browse/ARROW-16653).

Please note the intent here is to map out currently available formats via a new test. That is to support [ARROW-16395](https://issues.apache.org/jira/browse/ARROW-16395) which will add `ymd_hms() ymd_hm() ymd_h() dmy_hms() dmy_hm() dmy_h() mdy_hms() mdy_hm() mdy_h() ydm_hms() ydm_hm() ydm_h()`.

Currently most [lubridate supported flags](https://lubridate.tidyverse.org/reference/parse_date_time.html#details) are available and this adds a test. Remaining are `%q` and `%Op` that we don't need to resolve [ARROW-16395](https://issues.apache.org/jira/browse/ARROW-16395). We could open a ticket for adding support for the two remaining flags to C++ `strptime` or wait for users to request them.

Lead-authored-by: Rok <rok@mihevc.org>
Co-authored-by: Rok Mihevc <rok@mihevc.org>
Signed-off-by: Dewey Dunnington <dewey@fishandwhistle.net>
  • Loading branch information
rok authored and kszucs committed Jul 29, 2022
1 parent 7ca43fb commit 5116973
Show file tree
Hide file tree
Showing 2 changed files with 292 additions and 83 deletions.
110 changes: 53 additions & 57 deletions r/R/dplyr-datetime-helpers.R
Expand Up @@ -163,67 +163,42 @@ build_formats <- function(orders) {
# process the `orders` (even if supplied in the desired format)
# Processing is needed (instead of passing
# formats as-is) due to the processing of the character vector in parse_date_time()

orders <- gsub("[^A-Za-z]", "", orders)
orders <- gsub("Y", "y", orders)

valid_formats <- "[a|A|b|B|d|H|I|j|m|Om|M|Op|p|q|OS|S|U|w|W|y|Y|r|R|T|z]"
invalid_orders <- nchar(gsub(valid_formats, "", orders)) > 0

if (any(invalid_orders)) {
arrow_not_supported(
paste0(
oxford_paste(
orders[invalid_orders]
),
" `orders`"
)
)
}

# we separate "ym', "my", and "yq" from the rest of the `orders` vector and
# transform them. `ym` and `yq` -> `ymd` & `my` -> `myd`
# this is needed for 2 reasons:
# 1. strptime does not parse "2022-05" -> we add "-01", thus changing the format,
# 2. for equivalence to lubridate, which parses `ym` to the first day of the month
short_orders <- c("ym", "my")
short_orders <- c("ym", "my", "yOm", "Omy")
quarter_orders <- c("yq", "qy")

if (any(orders %in% short_orders)) {
orders1 <- setdiff(orders, short_orders)
orders2 <- intersect(orders, short_orders)
orders2 <- paste0(orders2, "d")
orders <- unique(c(orders2, orders1))
}

if (any(orders == "yq")) {
orders1 <- setdiff(orders, "yq")
orders2 <- "ymd"
orders <- unique(c(orders1, orders2))
}

if (any(orders == "qy")) {
orders1 <- setdiff(orders, "qy")
orders2 <- "ymd"
orders <- unique(c(orders1, orders2))
}

ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
ymd_hms_orders <- c(
"ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS",
"mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H"
)
# support "%I" hour formats
ymd_ims_orders <- gsub("H", "I", ymd_hms_orders)

supported_orders <- c(
ymd_orders,
ymd_hms_orders,
gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as order separators
gsub("_", "", ymd_hms_orders),
ymd_ims_orders,
gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as order separators
gsub("_", "", ymd_ims_orders)
)

unsupported_passed_orders <- setdiff(orders, supported_orders)
supported_passed_orders <- intersect(orders, supported_orders)

# error only if there isn't at least one valid order we can try
if (length(supported_passed_orders) == 0) {
arrow_not_supported(
paste0(
oxford_paste(
unsupported_passed_orders
),
" `orders`"
)
)
if (any(orders %in% quarter_orders)) {
orders <- c(setdiff(orders, quarter_orders), "ymd")
}
orders <- unique(orders)

formats_list <- map(orders, build_format_from_order)
formats <- purrr::flatten_chr(formats_list)
Expand All @@ -239,26 +214,47 @@ build_formats <- function(orders) {
#'
#' @noRd
build_format_from_order <- function(order) {
month_formats <- c("%m", "%B", "%b")
week_formats <- c("%a", "%A")
year_formats <- c("%y", "%Y")
char_list <- list(
"y" = c("%y", "%Y"),
"m" = c("%m", "%B", "%b"),
"d" = "%d",
"H" = "%H",
"M" = "%M",
"S" = "%S",
"I" = "%I"
"%y" = year_formats,
"%Y" = year_formats,
"%m" = month_formats,
"%Om" = month_formats,
"%b" = month_formats,
"%B" = month_formats,
"%a" = week_formats,
"%A" = week_formats,
"%d" = "%d",
"%H" = "%H",
"%j" = "%j",
"%OS" = "%OS",
"%I" = "%I",
"%S" = "%S",
"%q" = "%q",
"%M" = "%M",
"%U" = "%U",
"%w" = "%w",
"%W" = "%W",
"%p" = "%p",
"%Op" = "%Op",
"%z" = "%z",
"%r" = c("%H", "%I-%p"),
"%R" = c("%H-%M", "%I-%M-%p"),
"%T" = c("%I-%M-%S-%p", "%H-%M-%S", "%H-%M-%OS")
)

split_order <- strsplit(order, split = "")[[1]]

split_order <- regmatches(order, gregexpr("(O{0,1}[a-zA-Z])", order))[[1]]
split_order <- paste0("%", split_order)
outcome <- expand.grid(char_list[split_order])

# we combine formats with and without the "-" separator, we will later
# coalesce through all of them (benchmarking indicated this is a more
# computationally efficient approach rather than figuring out if a string has
# separators or not and applying only )
# during parsing if the string to be parsed does not contain a separator
# separators or not and applying the relevant order afterwards)
formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
formats_without_sep <- do.call(paste, c(outcome, sep = ""))
formats_without_sep <- gsub("-", "", formats_with_sep)
c(formats_with_sep, formats_without_sep)
}

Expand Down

0 comments on commit 5116973

Please sign in to comment.