Skip to content

Latest commit

 

History

History
1522 lines (1244 loc) · 47.1 KB

stringr-replace.md

File metadata and controls

1522 lines (1244 loc) · 47.1 KB

Benchmark of stringr replacement

Context

As part of #1549, we removed the stringr dependency by replacing every stringr function used in knitr with a base R equivalent.

This document aims to benchmark each replacement. We use bench to run the benchmarks and compare each stringr function with its replacement. Each benchmark is run in the context of the function's initial usage in knitr.

Session info

xfun::session_info(c("knitr", "bench", "stringr"))
## R version 4.2.0 (2022-04-22 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Locale:
##   LC_COLLATE=French_France.utf8  LC_CTYPE=French_France.utf8   
##   LC_MONETARY=French_France.utf8 LC_NUMERIC=C                  
##   LC_TIME=French_France.utf8    
## 
## Package version:
##   bench_1.1.2     cli_3.6.0       evaluate_0.20   fansi_1.0.3    
##   glue_1.6.2      graphics_4.2.0  grDevices_4.2.0 highr_0.10     
##   knitr_1.41.9    lifecycle_1.0.3 magrittr_2.0.3  methods_4.2.0  
##   pillar_1.8.1    pkgconfig_2.0.3 profmem_0.6.0   rlang_1.0.6    
##   stats_4.2.0     stringi_1.7.12  stringr_1.5.0   tibble_3.1.8   
##   tools_4.2.0     utf8_1.2.2      utils_4.2.0     vctrs_0.5.1    
##   xfun_0.36       yaml_2.3.6

Helpers

# path of a file in the `examples` directory of the installed knitr package;
# `...` are the path components handed over to system.file()
knitr_example = function(...) {
  system.file('examples', ..., package = 'knitr')
}

Benchmarking

stringr::str_replace_all()

Commit PR File
9c92eff1 #2174 utils-vignettes.R#L158

Before:

x[!i] = stringr::str_replace_all(x[!i], p$inline.code, '') # remove inline code

After:

# knitr code after the change (excerpt; `x`, `i` and `p` are only defined by
# the benchmark setup below)
x[!i] = gsub(p$inline.code, '', x[!i], perl = TRUE) # remove inline code
# benchmark setup: read the knitr-minimal.Rnw example and pick the pattern
# list detected for its file extension
file = knitr_example('knitr-minimal.Rnw')
x = xfun::read_utf8(file)
p = knitr:::detect_pattern(x, tolower(xfun::file_ext(file)))
p = knitr:::all_patterns[[p]]
p1 = p$chunk.begin; p2  = p$chunk.end
# NOTE(review): `i` presumably flags the lines that belong to code chunks, so
# x[!i] would be the text lines -- confirm against knitr:::group_indices()
m = knitr:::group_indices(grepl(p1, x), grepl(p2, x))
i = m %% 2 == 0
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_replace_all(x[!i], p$inline.code, ''),
  new = gsub(p$inline.code, '', x[!i], perl = TRUE),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1.29   1.38      1        1226.     5.99
## 2 new         1      1         1.34        1      1
ggplot2::autoplot(res)
## Le chargement a nécessité le package : tidyr

stringr::str_trim()

Commit PR File
8fa7d17 #2177

parser.R#L491

utils.R#L77

Before:

labels = stringr::str_trim(gsub(lab, '\\3', sapply(groups, `[`, 1)))

After:

# knitr code after the change (excerpt): same expression as the "Before"
# version with stringr::str_trim() swapped for base trimws()
labels = trimws(gsub(lab, '\\3', sapply(groups, `[`, 1)))
# benchmark setup: read the example script over the network and split its
# lines into groups, a new group starting at each line that matches
# knitr:::.sep.label
path = "https://raw.githubusercontent.com/yihui/knitr-examples/46c8d1db0cf0c9ab04432444079927324c4c3688/113-foo.R"
lines = xfun::read_utf8(path)
lab = knitr:::.sep.label
idx = cumsum(grepl(lab, lines))
groups = unname(split(lines, idx))
# first line of every group, reduced to the third capture group of the
# pattern; this is the vector that gets trimmed
content = gsub(lab, '\\3', sapply(groups, `[`, 1))

# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_trim(content),
  new = trimws(content),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         2.33       Inf     7.76
## 2 new         2.48   2.54      1          NaN     1
ggplot2::autoplot(res)

Commit PR File
8fa7d17 #2177 utils.R#L77

Before:

stringr::str_trim(stringr::str_split(string, ';|,')[[1]])

After:

# knitr code after the change (excerpt): only the trimming moved to base R,
# the split itself still goes through stringr here
trimws(stringr::str_split(string, ';|,')[[1]])
# benchmark input: a short list of numbers separated by ',' or ';' with
# stray spaces around the items
string = ' .5,.6 , .7; .9 '
splitted = stringr::str_split(string, ';|,')[[1]]

# only the trimming step is timed; the split is done once beforehand
res = bench::mark(
  stringr = stringr::str_trim(splitted),
  new = trimws(splitted),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         2.32       Inf     7.74
## 2 new         2.47   2.48      1          NaN     1
ggplot2::autoplot(res)

stringr::str_dup()

Commit PR File
1ce8286 #2186 parser.R#L348

Before:

cat(' ', stringr::str_dup('~', getOption('width') - 10L), '\n')

After:

# repeat `x` `n` times and glue the copies into a single string, with `sep`
# between consecutive copies
rep_str = function(x, n, sep = '') {
  copies = rep(x, n)
  paste(copies, collapse = sep)
}
# knitr code after the change (excerpt): print a rule of '~' that is 10
# characters narrower than the console width
cat(' ', rep_str('~', getOption('width') - 10L), '\n')
# the `new` expression inlines the body of rep_str() instead of calling it;
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_dup('~', getOption('width') - 10L),
  new = paste(rep('~', getOption('width') - 10L), collapse = ''),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         1.20      5.96      Inf
## 2 new         1.17   1.14      1         1         NaN
ggplot2::autoplot(res)

Later, the usage of stringr::str_dup() and its replacement rep_str() was removed completely in https://github.com/yihui/knitr/commit/0efb7914a1be93460e510bb283f9d4d72c49b360, as the code was simplified in https://github.com/yihui/knitr/commit/07bf3adf49f67bd528be4e6a60b3284a7ea6f8a9 and https://github.com/yihui/knitr/commit/7ef7be00711de949fcf2a299d6381189669bb753, so this replacement no longer has any impact.

stringr::str_split()

Commit PR File
67b973 #2187

pandoc.R#L125

utils.R#L77

Before:

x = stringr::str_split(x, '\n')

After:

# patch strsplit() to split '' into '' instead of character(0)
#
# x: character vector to split; split: the separator, passed to strsplit();
# ...: further arguments passed to strsplit()
# returns: a list with one character vector per element of `x`
str_split = function(x, split, ...) {
  y = strsplit(x, split, ...)
  # overwrite the result for empty input strings with a single ''
  y[x == ''] = list('')
  y
}

# knitr code after the change (excerpt)
x = str_split(x, '\n')
# benchmark setup: read the example pandoc config over the network as DCF and
# keep the non-missing fields
con = url("https://raw.githubusercontent.com/yihui/knitr-examples/46c8d1db0cf0c9ab04432444079927324c4c3688/084-pandoc.pandoc")
x = read.dcf(con)
close(con)
x = x[!is.na(x)]
# strsplit() wrapper under test: empty input strings yield '' rather than
# character(0)
str_split = function(x, split, ...) {
  y = strsplit(x, split, ...)
  y[x == ''] = list('')
  y
}
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_split(x, '\n'),
  new = str_split(x, '\n'),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     2.48   2.48      1         2.38     1   
## 2 new         1      1         2.38      1        1.19
ggplot2::autoplot(res)

stringr::str_sub()

Commit PR File
5a2cb72 #2195 utils-string.R
block.R#L564
utils.R#L92-L93

Before:

stringr::str_sub(input, loc[i, 1], loc[i, 2]) = if (length(res)) {
      paste(hook(res), collapse = '')
    } else ''

After:

# replace parts of a string with new values; `pos` is a matrix of positions and
# each row is a pair of [start, end]
#
# x: a single string; pos: two-column matrix of [start, end] positions;
# value: the replacement strings, one per row of `pos`
# returns: a single string with every [start, end] span replaced
str_replace = function(x, pos, value) {
  if (length(x) != 1) stop("Only a character scalar is supported.")
  # extract parts of the string that are outside [start, end]
  m = rbind(pos[, 1] - 1, pos[, 2] + 1)
  # add the string boundaries, so that each column of `m` is the [from, to]
  # range of one piece to keep
  m = matrix(c(1, m, nchar(x)), nrow = 2)
  y = substring(x, m[1, ], m[2, ])
  # interleave kept pieces and replacements ('' pads the last column)
  paste(rbind(y, c(value, '')), collapse = '')
}

# knitr code after the change (excerpt)
str_replace(input, block$location, ans)
# benchmark input: one inline expression spanning characters 8 to 27, to be
# replaced by its result "1"
input = "inline `r (function() 1)()`"
location = matrix(c(8,27), ncol = 2, byrow = TRUE)
ans = "1"

# replace the [start, end] spans listed in the rows of `pos` with `value`;
# `x` must be a single string
str_replace = function(x, pos, value) {
  if (length(x) != 1) stop("Only a character scalar is supported.")
  # extract parts of the string that are outside [start, end]
  m = rbind(pos[, 1] - 1, pos[, 2] + 1)
  # add the string boundaries: each column of `m` is one piece to keep
  m = matrix(c(1, m, nchar(x)), nrow = 2)
  y = substring(x, m[1, ], m[2, ])
  # interleave kept pieces and replacements ('' pads the last column)
  paste(rbind(y, c(value, '')), collapse = '')
}

# stand-in for the former knitr code path: overwrite the span given by the
# first row of `loc` ([start, end]) in `input` with `ans`, using stringr
old_replace = function(input, loc, ans) {
  from = loc[1, 1]
  to = loc[1, 2]
  stringr::str_sub(input, from, to) = ans
  input
}
# compare the stringr-based wrapper with the base R str_replace();
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = old_replace(input, location, ans),
  new = str_replace(input, location, ans),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         1.21      9.19     1.21
## 2 new         1.25   1.20      1         1        1
ggplot2::autoplot(res)

Commit PR File
5a2cb72 #2195 header.R#L59-L60
header.R#L90-L91

Before:

tmp = stringr::str_sub(doc[i], l[, 1], l[, 2])
stringr::str_sub(doc[i], l[,1], l[,2]) = paste0(tmp, '\n', make_header_html())

After :

# knitr code after the change (excerpt): extract the matched header text,
# then put it back followed by a newline and the generated HTML header
tmp = substr(doc[i], l[, 1], l[, 2])
doc[i] = str_replace(doc[i], l, paste0(tmp, '\n', make_header_html()))
# benchmark setup: a tiny HTML document; `i` is the index of the line that
# matches the header pattern and `l` the [start, end] of the match
b = knitr::all_patterns$html$header.begin
doc = c("<head>", "<!--content-->", "</head>", "<body>", "</body>")
i = grep(b, doc)
l = stringr::str_locate(doc[i], b)
# first step only: extraction of the matched text
res = bench::mark(
  stringr = stringr::str_sub(doc[i], l[, 1], l[, 2]),
  new = substr(doc[i], l[, 1], l[, 2]),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     3.48   3.45      1          Inf     1   
## 2 new         1      1         3.59       NaN     3.59
ggplot2::autoplot(res)

# second step: write the header back into the document; `tmp` and `header`
# are computed once so that only the replacement itself is timed
tmp = stringr::str_sub(doc[i], l[, 1], l[, 2])
header = knitr:::make_header_html()

# former approach: in-place replacement with stringr; `i` is not a parameter
# and is taken from the enclosing environment; unlike the knitr excerpt, no
# '\n' is inserted between `tmp` and `header`
old_fun = function(doc, l, tmp, header) {
  stringr::str_sub(doc[i], l[,1], l[,2]) = paste0(tmp, header)
  doc
}

# new approach: same replacement through str_replace(); `i` is again taken
# from the enclosing environment
new_fun = function(doc, l, tmp, header) {
  doc[i] = str_replace(doc[i], l, paste0(tmp, header))
  doc
}

# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = old_fun(doc, l, tmp, header),
  new = new_fun(doc, l, tmp, header),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         1.13       Inf     1   
## 2 new         1.19   1.17      1          NaN     1.33
ggplot2::autoplot(res)

Before:

tmp = stringr::str_sub(doc[i], l[, 1], l[, 2])
stringr::str_sub(doc[i], l[,1], l[,2]) = paste0(tmp, make_header_latex(doc))

After :

# knitr code after the change (excerpt): extract the matched header text,
# then put it back followed by the generated LaTeX header
tmp = substr(doc[i], l[, 1], l[, 2])
doc[i] = str_replace(doc[i], l, paste0(tmp, make_header_latex(doc)))
# benchmark setup: a two-line LaTeX document; `i` is the index of the line
# that matches the header pattern and `l` the [start, end] of the match
b = knitr::all_patterns$tex$header.begin
doc = c("\\documentclass[opt]{article}", "some tex content")
i = grep(b, doc)
l = stringr::str_locate(doc[i], b)
# first step only: extraction of the matched text
res = bench::mark(
  stringr = stringr::str_sub(doc[i], l[, 1], l[, 2]),
  new = substr(doc[i], l[, 1], l[, 2]),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     3.55   3.39      1          Inf     1   
## 2 new         1      1         3.42       NaN     3.42
ggplot2::autoplot(res)

# second step: write the header back into the document; `tmp` and `header`
# are computed once so that only the replacement itself is timed
tmp = stringr::str_sub(doc[i], l[, 1], l[, 2])
header = knitr:::make_header_latex(doc)

# former approach: in-place replacement with stringr; `i` is not a parameter
# and is taken from the enclosing environment
old_fun = function(doc, l, tmp, header) {
  stringr::str_sub(doc[i], l[,1], l[,2]) = paste0(tmp, header)
  doc
}

# new approach: same replacement through str_replace(); `i` is again taken
# from the enclosing environment
new_fun = function(doc, l, tmp, header) {
  doc[i] = str_replace(doc[i], l, paste0(tmp, header))
  doc
}

# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = old_fun(doc, l, tmp, header),
  new = new_fun(doc, l, tmp, header),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         1.20       Inf     1   
## 2 new         1.25   1.25      1          NaN     1.25
ggplot2::autoplot(res)

Variant replacement: str_insert() instead of str_replace()

Commit PR File
5a2cb72 #2195 header.R#L59-L60
header.R#L90-L91

Before:

tmp = stringr::str_sub(doc[i], l[, 1], l[, 2])
stringr::str_sub(doc[i], l[,1], l[,2]) = paste0(tmp, '\n', make_header_html())

After :

# variant of the knitr code (excerpt): insert a newline and the HTML header
# right after the end of the match instead of replacing the match
doc[i] = str_insert(doc[i], l[, 2], paste0('\n', make_header_html()))
# benchmark setup: a tiny HTML document; `i` is the index of the line that
# matches the header pattern and `l` the [start, end] of the match
b = knitr::all_patterns$html$header.begin
doc = c("<head>", "<!--content-->", "</head>", "<body>", "</body>")
i = grep(b, doc)
l = stringr::str_locate(doc[i], b)
# computed once so that header generation is not part of the timings
header = knitr:::make_header_html()

# insert `value` into the string `x` right after character position `i`;
# a position at or before 0 prepends `value`, a position at or past the end
# of `x` (or an empty `x`) appends it
str_insert = function(x, i, value) {
  if (i <= 0) {
    return(paste0(value, x))
  }
  len = nchar(x)
  if (len == 0 || i >= len) {
    paste0(x, value)
  } else {
    left = substr(x, 1, i)
    right = substr(x, i + 1, len)
    paste0(left, value, right)
  }
}

# former approach: extract the match, then replace it in place with stringr;
# `i` is taken from the enclosing environment; no '\n' is added before the
# header
old_str_sub = function(doc, l, header) {
  tmp = stringr::str_sub(doc[i], l[, 1], l[, 2])
  stringr::str_sub(doc[i], l[,1], l[,2]) = paste0(tmp, header)
  doc
}

# str_replace() approach: same two steps with base R; no '\n' is added
# before the header either
new_str_replace = function(doc, l, header) {
  tmp = substr(doc[i], l[, 1], l[, 2])
  doc[i] = str_replace(doc[i], l, paste0(tmp, header))
  doc
}

# str_insert() approach: a single insertion after the end of the match; this
# wrapper is the only one that puts '\n' in front of the header
new_str_insert = function(doc, l, header) {
  doc[i] = str_insert(doc[i], l[, 2], paste0('\n', header))
  doc
}

# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = old_str_sub(doc, l, header),
  str_replace = new_str_replace(doc, l, header),
  str_insert = new_str_insert(doc, l, header),
  min_time = Inf,
  # we get a small difference in output
  check = FALSE
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 3 × 6
##   expression    min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>  <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr      2.81   2.74      1          Inf     1.28
## 2 str_replace  2.47   2.41      1.11       NaN     1.89
## 3 str_insert   1      1         2.34       NaN     1
ggplot2::autoplot(res)

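Special edge case here: the outputs of this benchmark are not identical (hence check = FALSE above), because the str_insert() wrapper prepends '\n' to the header while the stringr and str_replace() wrappers paste the header directly after the match.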

  • New version gives <head>\n<style type="text/css">\n\n</style>
  • stringr version gives <head><style type="text/css">\n\n</style>

Before:

tmp = stringr::str_sub(doc[i], l[, 1], l[, 2])
stringr::str_sub(doc[i], l[,1], l[,2]) = paste0(tmp, make_header_latex(doc))

After :

# variant of the knitr code (excerpt): insert the LaTeX header right after
# the end of the match instead of replacing the match
doc[i] = str_insert(doc[i], l[, 2], make_header_latex(doc))
# benchmark setup: a two-line LaTeX document; `i` is the index of the line
# that matches the header pattern and `l` the [start, end] of the match
b = knitr::all_patterns$tex$header.begin
doc = c("\\documentclass[opt]{article}", "some tex content")
i = grep(b, doc)
l = stringr::str_locate(doc[i], b)
# computed once so that header generation is not part of the timings
header = knitr:::make_header_latex(doc)

# former approach: extract the match, then replace it in place with stringr;
# `i` is taken from the enclosing environment
old_str_sub = function(doc, l, header) {
  tmp = stringr::str_sub(doc[i], l[, 1], l[, 2])
  stringr::str_sub(doc[i], l[,1], l[,2]) = paste0(tmp, header)
  doc
}

# str_replace() approach: same two steps with base R
new_str_replace = function(doc, l, header) {
  tmp = substr(doc[i], l[, 1], l[, 2])
  doc[i] = str_replace(doc[i], l, paste0(tmp, header))
  doc
}

# str_insert() approach: a single insertion after the end of the match
new_str_insert = function(doc, l, header) {
  doc[i] = str_insert(doc[i], l[, 2], header)
  doc
}

# `check` is left at its default here, unlike in the HTML variant;
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = old_str_sub(doc, l, header),
  str_replace = new_str_replace(doc, l, header),
  str_insert = new_str_insert(doc, l, header),
  min_time = Inf
)

# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 3 × 6
##   expression    min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>  <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr      3.34   3.31      1          Inf      Inf
## 2 str_replace  3.05   3.02      1.14       NaN      Inf
## 3 str_insert   1      1         3.44       NaN      NaN
ggplot2::autoplot(res)

stringr::str_detect()

Commit PR File
1a0f2cc #2202 pandoc.R#L125

Before:

if (length(pat) && any(stringr::str_detect(text, pat))) return(p)

After:

# knitr code after the change (excerpt taken from inside a function, hence
# the bare return())
if (length(pat) && any(grepl(pat, text, perl = TRUE))) return(p)
# benchmark setup: the Markdown chunk-begin pattern matched against every
# line of the knitr-spin.Rmd example
pat = knitr:::all_patterns$md$chunk.begin
text = xfun::read_utf8(knitr_example("knitr-spin.Rmd"))
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_detect(text, pat),
  new = grepl(pat, text, perl = TRUE),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         2.13      76.1      Inf
## 2 new         2.20   2.24      1          1        NaN
ggplot2::autoplot(res)

stringr::str_wrap()

Commit PR File
1a0f2cc #2202 citation.R#L135

Before:

b[-idx] = stringr::str_wrap(b[-idx], width, 2, 4)

After:

# vectorised wrapper around strwrap(): wrap each element of the input on its
# own (simplify = FALSE keeps one result per element), then collapse each
# result back to a single string with one_string() (a knitr helper that is
# not shown here)
str_wrap = function(...) {
  res = strwrap(..., simplify = FALSE)
  unlist(lapply(res, one_string))
}
# knitr code after the change (excerpt)
b[-idx] = str_wrap(b[-idx], width, 2, 4)
# benchmark setup: build the BibTeX entry of the knitr package; `tweak` and
# `.tweak.bib` are defined but not used in the code shown here
x = "knitr"
lib.loc = NULL
tweak = TRUE
prefix = 'R-'
.tweak.bib = knitr:::.tweak.bib
# local wrapper (masks utils::citation in this script) that always passes
# `lib.loc`
citation = function(...) utils::citation(..., lib.loc = lib.loc)
# one BibTeX entry per package name in `x`, returned as a named list
bib = sapply(x, function(pkg) {
  cite = citation(pkg, auto = if (pkg != 'base') {
    meta = packageDescription(pkg, lib.loc = lib.loc)
    # don't use the CRAN URL if the package has provided its own URL
    if (identical(meta$Repository, 'CRAN') && !is.null(meta$URL)) {
      # however, the package may have provided multiple URLs, in which case we
      # still use the CRAN URL
      if (!grepl('[, ]', meta$URL)) meta$Repository = NULL
    }
    meta
  })
  entry = toBibtex(cite)
  # give the entry a citation key made of the prefix and the package name
  entry[1] = sub('\\{,$', sprintf('{%s%s,', prefix, pkg), entry[1])
  entry
}, simplify = FALSE)
# strwrap() wrapper under test: one wrapped string per input element
str_wrap = function(...) {
  res = strwrap(..., simplify = FALSE)
  unlist(lapply(res, knitr:::one_string))
}
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_wrap(bib$knitr, 20, 2, 4),
  new = str_wrap(bib$knitr, 20, 2, 4),
  min_time = Inf, 
  check = FALSE # functions are not completely equivalent
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         5.20      1.74     4.87
## 2 new         5.12   5.48      1         1        1
ggplot2::autoplot(res)

Special edge case here: Our replacement function is not completely equivalent.

  • New version gives note = {R package\n version\n 1.41.9},
  • stringr version gives note = {R package\n version 1.41.9},
Commit PR File
1a0f2cc #2202 output.R#L503

Before:

stringr::str_wrap(message, width = getOption('width'))

After:

# vectorised wrapper around strwrap(): wrap each element of the input on its
# own (simplify = FALSE keeps one result per element), then collapse each
# result back to a single string with one_string() (a knitr helper that is
# not shown here)
str_wrap = function(...) {
  res = strwrap(..., simplify = FALSE)
  unlist(lapply(res, one_string))
}
# knitr code after the change (excerpt)
str_wrap(message, width = getOption('width'))
# benchmark input: a message long enough to be wrapped at width 10
warn_msg = "Warning function: This is a warning message quite loooooooooooong"
# strwrap() wrapper under test: one wrapped string per input element
str_wrap = function(...) {
  res = strwrap(..., simplify = FALSE)
  unlist(lapply(res, knitr:::one_string))
}
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_wrap(warn_msg, 10),
  new = str_wrap(warn_msg, 10),
  min_time = Inf, 
  check = FALSE # functions are not completely equivalent
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         2.12       Inf     2.33
## 2 new         2.14   2.23      1          NaN     1
ggplot2::autoplot(res)

Special edge case here: Our replacement function is not completely equivalent.

  • New version gives Warning\nfunction:\nThis is\na warning\nmessage\nquite\nloooooooooooong
  • stringr version gives Warning\nfunction:\nThis is a\nwarning\nmessage\nquite\nloooooooooooong

stringr::str_pad()

Commit PR File
1a0f2cc #2202 block.R#L619

Before:

paste0('## ----', stringr::str_pad(label, max(getOption('width') - 11L, 0L), 'right', '-'), '----', code)

After:

# knitr code after the change (excerpt): pad the label on the right with '-'
# by appending the missing number of dashes (never a negative number)
paste0('## ----', label, strrep('-', max(getOption('width') - 11L - nchar(label), 0L)), '----', code)
# benchmark input; the trailing `code` of the excerpt is left out of both
# benchmarked expressions
label = "my-first-chunk"
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = paste0('## ----', stringr::str_pad(label, max(getOption('width') - 11L, 0L), 'right', '-'), '----'),
  new = paste0('## ----', label, strrep('-', max(getOption('width') - 11L - nchar(label), 0L)), '----'),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     13.0   12.9       1         Inf     1   
## 2 new          1      1        13.9       NaN     1.16
ggplot2::autoplot(res)

stringr::str_count()

Commit PR File
1a0f2cc #2202 utils.R#L583-L590

Before:

# stringr-based line counter, used as the benchmark baseline: the number of
# '\n' in each element of `x`, plus one
line_count_str = function(x) {
  newlines = stringr::str_count(x, '\n')
  newlines + 1L
}

After:

# count the lines in each element of `x`, i.e. the number of '\n' characters
# plus one
#
# x: a character vector
# returns: an integer vector with one count per element of `x`; integer(0)
#   for empty input and NA for NA elements
line_count = function(x) {
  res = gregexpr('\n', x, fixed = TRUE)
  # vapply() fixes the result type, so empty input gives integer(0) and not
  # NULL
  vapply(res, function(m) {
    # a single NA position means the input element itself was NA
    if (length(m) == 1L && is.na(m)) return(NA_integer_)
    # gregexpr() reports "no match" as a single -1, not as an empty vector
    if (length(m) == 1L && m == -1L) return(1L)
    length(m) + 1L
  }, integer(1), USE.NAMES = FALSE)
}
# benchmark input: every line of the knitr-spin.Rmd example
text = xfun::read_utf8(knitr_example("knitr-spin.Rmd"))
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = line_count_str(text),
  new = line_count(text),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         2.83       1        1  
## 2 new         2.27   2.97      1         23.7     18.4
ggplot2::autoplot(res)

stringr::str_extract_all()

Commit PR File
cc3b92a #2205 block.R#L587

Before:

eval(parse_only(unlist(stringr::str_extract_all(code, 'read_chunk\\(([^)]+)\\)'))))

After:

# extract every match of the Perl-compatible regex `pattern` from each
# element of `x`; returns a list with one character vector per element
str_extract = function(x, pattern) {
  m = gregexpr(pattern, x, perl = TRUE)
  regmatches(x, m)
}
# knitr code after the change (excerpt): evaluate every read_chunk(...) call
# found in `code`
eval(parse_only(unlist(str_extract(code, 'read_chunk\\(([^)]+)\\)'))))
# benchmark input
code = "knitr::read_chunk('113-foo.R')"
# function under test (same definition as in the excerpt)
str_extract = function(x, pattern) {
  m = gregexpr(pattern, x, perl = TRUE)
  regmatches(x, m)
}
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_extract_all(code, 'read_chunk\\(([^)]+)\\)'),
  new = str_extract(code, 'read_chunk\\(([^)]+)\\)'),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         3.75      1        1.50
## 2 new         3.81   3.98      1         4.83     1
ggplot2::autoplot(res)

Commit PR File
cc3b92a #2205 template.R#L123

Before:

mat = stringr::str_extract_all(txt, delim)[[1L]]

After:

# extract every match of the Perl-compatible regex `pattern` from each
# element of `x`; returns a list with one character vector per element
str_extract = function(x, pattern) {
  m = gregexpr(pattern, x, perl = TRUE)
  regmatches(x, m)
}
# knitr code after the change (excerpt)
mat = str_extract(txt, delim)[[1L]]
# benchmark setup: a template string and its '{{' / '}}' delimiters
txt = 'This is the value of `x`: {{x}}'
delim = c('{{', '}}')
# escape regex metacharacters in the delimiters with a backslash
delim = gsub('([.|()\\^{}+$*?]|\\[|\\])', '\\\\\\1', delim)
# final pattern: opening delimiter, lazy multi-line content, closing delimiter
delim = paste0(delim[1L], '((.|\n)+?)', delim[2L])
# function under test (same definition as in the excerpt)
str_extract = function(x, pattern) {
  m = gregexpr(pattern, x, perl = TRUE)
  regmatches(x, m)
}
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_extract_all(txt, delim),
  new = str_extract(txt, delim),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         4.06       1       1.35
## 2 new         4.01   4.30      1         93.8     1
ggplot2::autoplot(res)

stringr::str_locate()

Commit PR File
cc3b92a #2205 header.R#L58
header.R#L89

Before:

stringr::str_locate(doc[i], b)

After:

# turn the result of regexpr() (or one element of a gregexpr() result) into a
# two-column matrix of match positions with columns `start` and `end`
location = function(x) {
  # read the attribute first: the assignment below drops it
  len = attr(x, 'match.length')
  # a single -1 means "no match": produce a matrix with zero rows
  if (length(x) == 1 && x == -1) x = integer()
  cbind(start = x, end = x + len - 1L)
}
# locate `pattern` (a Perl-compatible regex) in `x`; with all = TRUE every
# match is located and a list of matrices is returned, otherwise only the
# first match of each element and a single matrix
str_locate = function(x, pattern, all = TRUE) {
  out = (if (all) gregexpr else regexpr)(pattern, x, perl = TRUE)
  if (all) lapply(out, location) else location(out)
}
# knitr code after the change (excerpt)
str_locate(doc[i], b, FALSE)
# benchmark setup: a tiny HTML document; `i` is the index of the line that
# matches the header pattern
b = knitr::all_patterns$html$header.begin
doc = c("<head>", "<!--content-->", "</head>", "<body>", "</body>")
i = grep(b, doc)
# functions under test (same definitions as in the excerpt)
location = function(x) {
  len = attr(x, 'match.length')
  if (length(x) == 1 && x == -1) x = integer()
  cbind(start = x, end = x + len - 1L)
}
str_locate = function(x, pattern, all = TRUE) {
  out = (if (all) gregexpr else regexpr)(pattern, x, perl = TRUE)
  if (all) lapply(out, location) else location(out)
}
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_locate(doc[i], b),
  new = str_locate(doc[i], b, FALSE),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         3.59       1       7.17
## 2 new         3.55   3.74      1         75.8     1
ggplot2::autoplot(res)

Commit PR File
cc3b92a #2205 parser.R#L358

Before:

loc = stringr::str_locate_all(input, inline.code)[[1]]

After:

# turn the result of regexpr() (or one element of a gregexpr() result) into a
# two-column matrix of match positions with columns `start` and `end`
location = function(x) {
  # read the attribute first: the assignment below drops it
  len = attr(x, 'match.length')
  # a single -1 means "no match": produce a matrix with zero rows
  if (length(x) == 1 && x == -1) x = integer()
  cbind(start = x, end = x + len - 1L)
}
# locate `pattern` in `x`: all matches (list of matrices) when all = TRUE,
# first match only (single matrix) otherwise
str_locate = function(x, pattern, all = TRUE) {
  out = (if (all) gregexpr else regexpr)(pattern, x, perl = TRUE)
  if (all) lapply(out, location) else location(out)
}
# knitr code after the change (excerpt): positions of all inline expressions
# in the single input string
loc = str_locate(input, inline.code)[[1]]
# benchmark input: one Markdown inline expression and the matching pattern
input = 'Inline expressions such as `r "the following"`'
inline.code = knitr:::all_patterns$md$inline.code
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_locate_all(input, inline.code),
  new = str_locate(input, inline.code),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         3.05      1        1   
## 2 new         3.22   3.30      1         1.63     2.30
ggplot2::autoplot(res)

Commit PR File
cc3b92a #2205 utils.R#L94

Before:

idx = stringr::str_locate(txt, hb) # locate documentclass

After:

# turn the result of regexpr() (or one element of a gregexpr() result) into a
# two-column matrix of match positions with columns `start` and `end`
location = function(x) {
  # read the attribute first: the assignment below drops it
  len = attr(x, 'match.length')
  # a single -1 means "no match": produce a matrix with zero rows
  if (length(x) == 1 && x == -1) x = integer()
  cbind(start = x, end = x + len - 1L)
}
# locate `pattern` in `x`: all matches (list of matrices) when all = TRUE,
# first match only (single matrix) otherwise
str_locate = function(x, pattern, all = TRUE) {
  out = (if (all) gregexpr else regexpr)(pattern, x, perl = TRUE)
  if (all) lapply(out, location) else location(out)
}
# knitr code after the change (excerpt)
idx = str_locate(txt, hb, FALSE) # locate documentclass
# benchmark setup: take the lines of the knitr-main.Rnw example from the
# first header-begin match up to the line before the first document-begin
# match, and join them with knitr:::one_string()
input = xfun::read_utf8(knitr_example("child/knitr-main.Rnw"))
patterns = knitr:::all_patterns$tex
db = patterns$document.begin
hb = patterns$header.begin
idx2 = grep(db, input)[1]
idx1 = grep(hb, input)[1]
txt = knitr:::one_string(input[idx1:(idx2 - 1L)])
# functions under test (same definitions as in the excerpt)
location = function(x) {
  len = attr(x, 'match.length')
  if (length(x) == 1 && x == -1) x = integer()
  cbind(start = x, end = x + len - 1L)
}
str_locate = function(x, pattern, all = TRUE) {
  out = (if (all) gregexpr else regexpr)(pattern, x, perl = TRUE)
  if (all) lapply(out, location) else location(out)
}
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_locate(txt, hb),
  new = str_locate(txt, hb, FALSE),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         3.22       Inf     6.44
## 2 new         3.17   3.36      1          NaN     1
ggplot2::autoplot(res)

stringr::str_locate_all()

Commit PR File
cc3b92a #2205 template.R#L121

Before:

loc = stringr::str_locate_all(txt, delim)[[1L]]

After:

# turn the result of regexpr() (or one element of a gregexpr() result) into a
# two-column matrix of match positions with columns `start` and `end`
location = function(x) {
  # read the attribute first: the assignment below drops it
  len = attr(x, 'match.length')
  # a single -1 means "no match": produce a matrix with zero rows
  if (length(x) == 1 && x == -1) x = integer()
  cbind(start = x, end = x + len - 1L)
}
# locate `pattern` in `x`: all matches (list of matrices) when all = TRUE,
# first match only (single matrix) otherwise
str_locate = function(x, pattern, all = TRUE) {
  out = (if (all) gregexpr else regexpr)(pattern, x, perl = TRUE)
  if (all) lapply(out, location) else location(out)
}
# knitr code after the change (excerpt)
loc = str_locate(txt, delim)[[1L]]
# benchmark setup: a template string and its '{{' / '}}' delimiters
txt = 'This is the value of `x`: {{x}}'
delim = c('{{', '}}')
# escape regex metacharacters in the delimiters with a backslash
delim = gsub('([.|()\\^{}+$*?]|\\[|\\])', '\\\\\\1', delim)
# final pattern: opening delimiter, lazy multi-line content, closing delimiter
delim = paste0(delim[1L], '((.|\n)+?)', delim[2L])
# functions under test (same definitions as in the excerpt)
location = function(x) {
  len = attr(x, 'match.length')
  if (length(x) == 1 && x == -1) x = integer()
  cbind(start = x, end = x + len - 1L)
}
str_locate = function(x, pattern, all = TRUE) {
  out = (if (all) gregexpr else regexpr)(pattern, x, perl = TRUE)
  if (all) lapply(out, location) else location(out)
}
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = stringr::str_locate_all(txt, delim),
  new = str_locate(txt, delim),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         3.81       1       1   
## 2 new         3.85   3.96      1         93.8     1.05
ggplot2::autoplot(res)

stringr::str_match_all()

Commit PR File
cc3b92a #2205 parser.R#L360

Before:

code = stringr::str_match_all(input, inline.code)[[1L]]

After:

# match the Perl-compatible regex `pattern` (with capture groups) against `x`
# and column-bind the regmatches() results of all elements into one matrix
str_match = function(x, pattern) {
  # gregexec() was added in R 4.1.0; for lower versions of R, use fallback
  if (is.function(gregexec <- baseenv()[['gregexec']])) {
    m = gregexec(pattern, x, perl = TRUE)
  } else {
    # fallback: extract the full matches first, then run regexec() on each
    # of them to recover the capture groups
    x = unlist(str_extract(x, pattern))
    m = regexec(pattern, x, perl = TRUE)
  }
  do.call(cbind, regmatches(x, m))
}
# knitr code after the change (excerpt): the result is transposed with t()
code = t(str_match(input, inline.code))
# benchmark input: one Markdown inline expression and the matching pattern
input = 'Inline expressions such as `r "the following"`'
inline.code = knitr:::all_patterns$md$inline.code
# function under test (same definition as in the excerpt); the fallback
# branch relies on a str_extract() helper that is not defined in this chunk
str_match = function(x, pattern) {
  if (is.function(gregexec <- baseenv()[['gregexec']])) {
    m = gregexec(pattern, x, perl = TRUE)
  } else {
    x = unlist(str_extract(x, pattern))
    m = regexec(pattern, x, perl = TRUE)
  }
  do.call(cbind, regmatches(x, m))
}
# the stringr expression replaces NA entries with '' (presumably so that both
# results compare equal under the default check -- confirm);
# min_time = Inf: always run the maximum number of iterations
res = bench::mark(
  stringr = { 
    code = stringr::str_match_all(input, inline.code)[[1L]]
    code[is.na(code)] = ''
    code
  },
  new = t(str_match(input, inline.code)),
  min_time = Inf
)
# report every column relative to the best expression
summary(res, relative = TRUE)
## # A tibble: 2 × 6
##   expression   min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
## 1 stringr     1      1         4.45      1        1.11
## 2 new         4.26   4.75      1         6.75     1
ggplot2::autoplot(res)