In this notebook, we continue our discussion of regular-expression tools:

* [Grouped matches](#Grouped-matches)
* [Replacing matches](#Replacing-matches)
* [Splitting](#Splitting)
* [Find matches](#Find-matches)

# Grouped matches

In [1]:
library(tidyverse)
library(stringr)

Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats


In [2]:
# Two capture groups: whatever precedes the suffix, and the literal "ing"
# suffix itself, anchored to the whole string.
regex <- "^(.*)(ing)$"
# Keep only the entries of `words` that match the pattern, then display them.
ing_words <- str_subset(words, regex)
ing_words

In [3]:
# Extract the matched text for each word (the pattern is anchored at both
# ends, so the match is the whole word).
str_extract(ing_words, regex)

In [4]:
# str_match returns a matrix: the full match first, then one column per
# capture group (here the stem and the "ing" suffix).
str_match(ing_words, regex)

0,1,2
bring,br,ing
during,dur,ing
evening,even,ing
king,k,ing
meaning,mean,ing
morning,morn,ing
ring,r,ing
sing,s,ing
thing,th,ing


# Replacing matches

In [5]:
# Collect the words containing "to" and display them.
to_words <- str_subset(words, "to")
to_words

In [6]:
# Substitute "2" for the first occurrence of "to" in each word.
to_words %>%
    str_replace("to", "2")

In [7]:
# Collect the words containing "for" and display them.
for_words <- str_subset(words, "for")
for_words

In [8]:
# Substitute "4" for the first occurrence of "for" in each word.
for_words %>%
    str_replace("for", "4")

In [9]:
# Collect the sentences containing "must" and display them.
must_sentences <- str_subset(sentences, "must")
must_sentences

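We can also use backreferences in `str_replace`: in the replacement string, `\\1`, `\\2`, ... stand for the text captured by the first, second, ... parenthesized group of the pattern.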

In [10]:
# Groups: \\1 = text before the subject word, \\2 = the word right before
# "must", \\3 = the text after "must" up to the final period.
# The replacement moves "<word> must" to the end, after a comma.
must_sentences %>%
    str_replace("(.*) ([^ ]*) must (.*)\\.", "\\1 \\3, \\2 must")

Let's swap the first and last letter in each word in `words`.

In [11]:
# First ten words, for comparison.
head(words, 10)
# Capture the first letter, the middle, and the last letter, then emit them
# with the first and last swapped; show the first ten results.
words %>%
    str_replace("^(.)(.*)(.)$", "\\3\\2\\1") %>%
    head(10)

# Splitting

In [12]:
# Split each of the first ten words at every vowel.
str_split(head(words, 10), "[aeiou]")

In [13]:
# Same split as before, but simplify = TRUE returns a character matrix
# (one row per word) instead of a list.
str_split(head(words, 10), "[aeiou]", simplify = TRUE)

0,1,2,3,4
,,,,
,bl,,,
,b,,t,
,bs,l,t,
,cc,pt,,
,cc,,nt,
,ch,,v,
,cr,ss,,
,ct,,,
,ct,v,,


In [14]:
# Break each of the first ten sentences apart at whitespace characters.
str_split(head(sentences, 10), "\\s")

In [15]:
# An empty pattern splits each word into its individual characters.
str_split(head(words, 10), "")

# Find matches

In [16]:
regex <- "th"
# Restrict to the words containing "th", then report the start and end
# position of the match within each of them.
str_locate(str_subset(words, regex), regex)

start,end
3,4
4,5
3,4
4,5
3,4
3,4
4,5
4,5
3,4
3,4


In [17]:
library(stringi) # for the stri_dup function

In [18]:
x <- c("You owe me $560", "I gave you $40", "We owe $3100 to the bank")
# Mask the first dollar amount in each sentence: keep the "$" and replace
# every digit with "X", then print the result.
# The loop variable is named `sentence` rather than `str` so that it does not
# shadow base R's str() function.
for (sentence in x) {
    # One-row matrix with columns "start" and "end"; both are NA when the
    # sentence contains no dollar amount.
    location <- str_locate(sentence, "\\$\\d+")
    start <- location[1, "start"]
    end <- location[1, "end"]
    # Only rewrite when an amount was found: assigning through str_sub() with
    # NA positions would turn the whole sentence into NA.
    if (!is.na(start)) {
        # The match covers "$" plus the digits, so end - start is the number
        # of digits to mask.
        str_sub(sentence, start, end) <- str_c("$", stri_dup("X", end - start))
    }
    print(sentence)
}

[1] "You owe me $XXX"
[1] "I gave you $XX"
[1] "We owe $XXXX to the bank"
