In this notebook, we will cover more topics related to regular expressions:

* [Repetition](#Repetition)
* [Grouping](#Grouping)
* [Detect matches](#Detect-matches)
* [Extract matches](#Extract-matches)

In [1]:
library(tidyverse)
library(stringr)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.2.1     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.4
[32m✔[39m [34mtidyr  [39m 1.0.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



# Repetition

We can control how many times a pattern matches:

* `?`: 0 or 1
* `+`: 1 or more
* `*`: 0 or more

In [2]:
# `s?` makes the trailing "s" optional (zero or one occurrence).
x <- c("cat", "dog", "dogs", "cats")
x %>% str_view("cats?")

In [3]:
# A bare `?` is a quantifier; escaping it as `\\?` matches a literal question mark.
str_view_all(string = "Why? No really, why?", pattern = "\\?")

In [4]:
# The group `(not )?` is optional, so the pattern covers both "is fun" and
# "is not fun".
x <- c(
  "hiking is fun",
  "reading is fun",
  "driving is not fun",
  "flying is not fun",
  "biking is fun"
)
x %>% str_view("is (not )?fun")

In [5]:
# `a+` matches a run of one or more "a" characters.
x <- c("abc", "bc", "aabc", "aaabc")
str_view(string = x, pattern = "a+")

In [6]:
# `b*` allows zero or more "b" characters between "a" and "c".
x <- c("ac", "abc", "abbc", "xyz", "abbbbc")
x %>% str_view("ab*c")

We can also specify the number of matches precisely:

* `{n}`: exactly n
* `{n,}`: n or more
* `{n,m}`: between n and m

In [7]:
# `{2}` requires exactly two "b" characters between "a" and "c".
x %>% str_view("ab{2}c")

In [8]:
# `{2,}` requires two or more "b" characters between "a" and "c".
x %>% str_view("ab{2,}c")

In [9]:
# `{0,2}` allows between zero and two "b" characters between "a" and "c".
x %>% str_view("ab{0,2}c")

# Grouping

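Parentheses define groups that can be referred to as `\1`, `\2`, etc. Inside an R string the backslash must itself be escaped, so these backreferences are written `\\1`, `\\2`.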

The optional argument `match = TRUE` means that only strings that match the pattern will be shown.

In [10]:
# Find strings that start and end with the same character: `(.)` captures the
# first character and `\\1` requires that same character right before the end.
x <- c("mom", "dad", "brother", "sister")
re <- "^(.).*\\1$"
x %>% str_view(re, match = TRUE)

In [11]:
# Find a repeated pair of characters: `(..)` captures any two characters and
# `\\1` requires the same pair to appear again later in the string.
x <- c(
  "he moved his head",
  "she moved her car",
  "nobody moved anything",
  "they moved their bikes"
)
re <- "(..).*\\1"
x %>% str_view(re, match = TRUE)

# Detect matches

In [12]:
# Which strings start with "x"? The result is one logical value per element.
x <- c("x-ray", "something", "xylophone", "xenophobia", "nothing")
x %>% str_detect("^x")

In [13]:
# `words` is a predefined character vector; show its first 10 entries.
words %>% head(n = 10)

In [14]:
# Number of words that start with "a": summing a logical vector counts its TRUEs.
words %>%
  str_detect("^a") %>%
  sum()

In [15]:
# For every lowercase letter, count how many entries of `words` start with it,
# then print one message per letter.
starts_with_count <- vapply(
  letters,
  function(first) sum(str_detect(words, str_c("^", first))),
  integer(1)
)
# sprintf() is vectorised, so this builds all 26 messages at once; they are
# printed one by one to keep a separate `[1] "..."` line per letter.
for (msg in sprintf("%d words start with %s", starts_with_count, letters)) {
  print(msg)
}

[1] "65 words start with a"
[1] "58 words start with b"
[1] "83 words start with c"
[1] "43 words start with d"
[1] "45 words start with e"
[1] "54 words start with f"
[1] "23 words start with g"
[1] "38 words start with h"
[1] "25 words start with i"
[1] "6 words start with j"
[1] "9 words start with k"
[1] "45 words start with l"
[1] "45 words start with m"
[1] "23 words start with n"
[1] "28 words start with o"
[1] "72 words start with p"
[1] "7 words start with q"
[1] "46 words start with r"
[1] "119 words start with s"
[1] "65 words start with t"
[1] "12 words start with u"
[1] "8 words start with v"
[1] "53 words start with w"
[1] "0 words start with x"
[1] "6 words start with y"
[1] "0 words start with z"


In [16]:
# Number of words with no vowels: the whole string (`^...$`) must consist of
# characters outside `[aeiou]`.
words %>%
  str_detect("^[^aeiou]*$") %>%
  sum()

In [17]:
# Logical subsetting: keep the words for which the no-vowel test is TRUE.
words[words %>% str_detect("^[^aeiou]*$")]

In [18]:
# str_subset() is a shortcut for detect-then-subset: it returns the matching
# strings directly.
words %>% str_subset("^[^aeiou]*$")

In [19]:
# Words made up of vowels only.
words %>% str_subset("^[aeiou]*$")

In [20]:
# Put the words in a tibble together with their position in the vector;
# `i` can refer to `word` because tibble() evaluates its columns in order.
df <- tibble(word = words, i = seq_along(word))
df %>% print()

[38;5;246m# A tibble: 980 x 2[39m
   word         i
   [3m[38;5;246m<chr>[39m[23m    [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m a            1
[38;5;250m 2[39m able         2
[38;5;250m 3[39m about        3
[38;5;250m 4[39m absolute     4
[38;5;250m 5[39m accept       5
[38;5;250m 6[39m account      6
[38;5;250m 7[39m achieve      7
[38;5;250m 8[39m across       8
[38;5;250m 9[39m act          9
[38;5;250m10[39m active      10
[38;5;246m# … with 970 more rows[39m


In [21]:
# str_detect() returns a logical vector, so it can be used directly as a
# condition in the dplyr verb filter(): keep the rows whose word starts with "q".
filter(df, str_detect(word, "^q"))

word,i
<chr>,<int>
quality,665
quarter,666
question,667
quick,668
quid,669
quiet,670
quite,671


In [22]:
str_view_all("xyxyxyx", "xyx") # note how matches don't overlap

In [23]:
# str_count() gives the number of matches of the pattern in the string.
str_count(string = "xyxyxyx", pattern = "xyx")

In [24]:
# Use str_count() inside mutate() to add, for each word, the number of
# characters in `[aeiou]` and the number of characters outside that set.
df <- mutate(
  df,
  num_vowels = str_count(word, "[aeiou]"),
  num_consonants = str_count(word, "[^aeiou]")
)
df %>% print()

[38;5;246m# A tibble: 980 x 4[39m
   word         i num_vowels num_consonants
   [3m[38;5;246m<chr>[39m[23m    [3m[38;5;246m<int>[39m[23m      [3m[38;5;246m<int>[39m[23m          [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m a            1          1              0
[38;5;250m 2[39m able         2          2              2
[38;5;250m 3[39m about        3          3              2
[38;5;250m 4[39m absolute     4          4              4
[38;5;250m 5[39m accept       5          2              4
[38;5;250m 6[39m account      6          3              4
[38;5;250m 7[39m achieve      7          4              3
[38;5;250m 8[39m across       8          2              4
[38;5;250m 9[39m act          9          1              2
[38;5;250m10[39m active      10          3              3
[38;5;246m# … with 970 more rows[39m


In [25]:
# Add a column with the proportion of vowels in each word:
# vowel count divided by the word's length in characters.
df <- mutate(df, vowel_prop = num_vowels / str_length(word))
df %>% print()

[38;5;246m# A tibble: 980 x 5[39m
   word         i num_vowels num_consonants vowel_prop
   [3m[38;5;246m<chr>[39m[23m    [3m[38;5;246m<int>[39m[23m      [3m[38;5;246m<int>[39m[23m          [3m[38;5;246m<int>[39m[23m      [3m[38;5;246m<dbl>[39m[23m
[38;5;250m 1[39m a            1          1              0      1    
[38;5;250m 2[39m able         2          2              2      0.5  
[38;5;250m 3[39m about        3          3              2      0.6  
[38;5;250m 4[39m absolute     4          4              4      0.5  
[38;5;250m 5[39m accept       5          2              4      0.333
[38;5;250m 6[39m account      6          3              4      0.429
[38;5;250m 7[39m achieve      7          4              3      0.571
[38;5;250m 8[39m across       8          2              4      0.333
[38;5;250m 9[39m act          9          1              2      0.333
[38;5;250m10[39m active      10          3              3      0.5  
[38;5;246m# … with 970 more rows[39m

In [26]:
# Words containing all five vowels: every single-vowel test must be TRUE.
df %>%
  filter(
    str_detect(word, "a") &
      str_detect(word, "e") &
      str_detect(word, "i") &
      str_detect(word, "o") &
      str_detect(word, "u")
  )

word,i,num_vowels,num_consonants,vowel_prop
<chr>,<int>,<int>,<int>,<dbl>


In [27]:
# Words with at least 4 different vowels. Adding the logical results counts how
# many of the five vowels occur in each word (TRUE counts as 1); `+` binds
# tighter than `>=`, so the sum is compared with 4.
df %>%
  filter(
    str_detect(word, "a") +
      str_detect(word, "e") +
      str_detect(word, "i") +
      str_detect(word, "o") +
      str_detect(word, "u") >= 4
  )

word,i,num_vowels,num_consonants,vowel_prop
<chr>,<int>,<int>,<int>,<dbl>
absolute,4,4,4,0.5
appropriate,48,5,6,0.4545455
associate,57,5,4,0.5555556
authority,61,4,5,0.4444444
colleague,166,5,4,0.5555556
continue,186,4,4,0.5
encourage,268,5,4,0.5555556
introduce,431,4,5,0.4444444
organize,585,4,4,0.5
previous,644,4,4,0.5


# Extract matches

In [28]:
# Building blocks: `v` matches one vowel, `c` matches one non-vowel character.
# NOTE: the variable `c` shares its name with base::c(); calls such as c(1, 2)
# still work because R skips non-function objects when resolving a call.
v <- "[aeiou]"
c <- "[^aeiou]"
# Concatenate the pieces into a vowel-vowel-consonant-vowel pattern.
vvcv <- str_c(v, v, c, v, sep = "")
vvcv %>% print()

[1] "[aeiou][aeiou][^aeiou][aeiou]"


In [29]:
# Words containing the pattern vvcv; store them, then display them.
vvcv_words <- words %>% str_subset(vvcv)
vvcv_words

In [30]:
# The matched text itself (first match in each word), not the whole word.
vvcv_words %>% str_extract(vvcv)

In [31]:
# Concatenate the pieces into a consonant-vowel-vowel-consonant pattern.
cvvc <- str_c(c, v, v, c, sep = "")
cvvc %>% print()

[1] "[^aeiou][aeiou][aeiou][^aeiou]"


In [32]:
# Words containing the pattern cvvc; store them, then display them.
cvvc_words <- words %>% str_subset(cvvc)
cvvc_words

In [33]:
# The matched text itself (first match in each word), not the whole word.
cvvc_words %>% str_extract(cvvc)

In [34]:
# Words that start with "q"; store them, then display them.
q_words <- words %>% str_subset("^q")
q_words

In [35]:
# str_extract() extracts only the first match in each string.
str_extract(string = q_words, pattern = v)

In [36]:
# str_extract_all() extracts every match in each string.
str_extract_all(string = q_words, pattern = v)

Setting `simplify = TRUE` returns a character matrix in which rows with fewer matches are padded with empty strings to the length of the longest.

In [37]:
# With simplify = TRUE the matches come back as a matrix, one row per word.
q_words %>% str_extract_all(v, simplify = TRUE)

0,1,2,3
u,a,i,
u,a,e,
u,e,i,o
u,i,,
u,i,,
u,i,e,
u,i,e,
