In this notebook, we will learn how to use regular expressions to solve problems such as:

* [Detect matches](#Detect-matches)

In [2]:
library(tidyverse)
library(stringr)

Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats


# Detect matches

In [4]:
# Sample strings: three begin with "x", two do not.
x <- c(
  "x-ray",
  "something",
  "xylophone",
  "xenophobia",
  "nothing"
)
# Which strings start with x? "^" anchors the match at the start of the string.
str_detect(x, "^x")

In [8]:
# `words` is a predefined character vector; show its first ten entries.
head(words, n = 10)

In [19]:
# Number of words that start with "a": summing a logical vector counts its TRUEs.
words %>%
  str_detect("^a") %>%
  sum()

In [21]:
# For each letter of the alphabet, report how many words begin with it.
for (let in letters) {
  # Build the anchored pattern "^<letter>" and count the words matching it.
  n_starting <- sum(str_detect(words, str_c("^", let)))
  print(sprintf("%d words start with %s", n_starting, let))
}

[1] "65 words start with a"
[1] "58 words start with b"
[1] "83 words start with c"
[1] "43 words start with d"
[1] "45 words start with e"
[1] "54 words start with f"
[1] "23 words start with g"
[1] "38 words start with h"
[1] "25 words start with i"
[1] "6 words start with j"
[1] "9 words start with k"
[1] "45 words start with l"
[1] "45 words start with m"
[1] "23 words start with n"
[1] "28 words start with o"
[1] "72 words start with p"
[1] "7 words start with q"
[1] "46 words start with r"
[1] "119 words start with s"
[1] "65 words start with t"
[1] "12 words start with u"
[1] "8 words start with v"
[1] "53 words start with w"
[1] "0 words start with x"
[1] "6 words start with y"
[1] "0 words start with z"


In [23]:
# Number of words with no vowels: the pattern requires every character between
# the start (^) and end ($) of the word to be something other than a, e, i, o, u.
words %>%
  str_detect("^[^aeiou]*$") %>%
  sum()

In [25]:
# Logical subsetting: str_detect() yields one TRUE/FALSE per word, and indexing
# `words` with that vector returns the words themselves (here, those with no
# vowels) rather than just their count.
words[str_detect(words, "^[^aeiou]*$")]

In [26]:
# str_subset() is a convenient shortcut for the detect-then-subset pattern:
# it returns the elements of `words` that match the pattern.
str_subset(words, "^[^aeiou]*$")

In [29]:
# Words made up solely of vowels. "+" (one or more) is used rather than "*"
# (zero or more) so that an empty string, which contains no vowels at all,
# can never be reported as a vowel-only word.
str_subset(words, "^[aeiou]+$")

In [30]:
# Two-column tibble: each word alongside its position in `words`.
df <- tibble(word = words, i = seq_along(words))

In [31]:
# Show the tibble built above.
print(df)

# A tibble: 980 x 2
       word     i
      <chr> <int>
 1        a     1
 2     able     2
 3    about     3
 4 absolute     4
 5   accept     5
 6  account     6
 7  achieve     7
 8   across     8
 9      act     9
10   active    10
# ... with 970 more rows


In [33]:
# str_detect() returns a logical vector, so it can be used directly as a
# condition in the dplyr verb filter(); here it keeps the words starting with q.
filter(df, str_detect(word, "^q"))

word,i
quality,665
quarter,666
question,667
quick,668
quid,669
quiet,670
quite,671


In [34]:
# Note how matches don't overlap.
# NOTE(review): newer stringr releases appear to deprecate str_view_all() in
# favour of str_view() -- confirm against the installed version before changing.
str_view_all("xyxyxyx", "xyx")

In [35]:
# str_count() gives the number of matches of the pattern in the string.
str_count("xyxyxyx", "xyx")

In [38]:
# Use str_count() inside mutate() to add two columns: the number of characters
# matching [aeiou] and the number of characters that do not.
df <- mutate(
  df,
  num_vowels = str_count(word, "[aeiou]"),
  num_consonants = str_count(word, "[^aeiou]")
)
print(df)

# A tibble: 980 x 4
       word     i num_vowels num_consonants
      <chr> <int>      <int>          <int>
 1        a     1          1              0
 2     able     2          2              2
 3    about     3          3              2
 4 absolute     4          4              4
 5   accept     5          2              4
 6  account     6          3              4
 7  achieve     7          4              3
 8   across     8          2              4
 9      act     9          1              2
10   active    10          3              3
# ... with 970 more rows


In [40]:
# New column: the proportion of each word's characters that are vowels.
df <- mutate(df, vowel_prop = num_vowels / str_length(word))
print(df)

# A tibble: 980 x 5
       word     i num_vowels num_consonants vowel_prop
      <chr> <int>      <int>          <int>      <dbl>
 1        a     1          1              0  1.0000000
 2     able     2          2              2  0.5000000
 3    about     3          3              2  0.6000000
 4 absolute     4          4              4  0.5000000
 5   accept     5          2              4  0.3333333
 6  account     6          3              4  0.4285714
 7  achieve     7          4              3  0.5714286
 8   across     8          2              4  0.3333333
 9      act     9          1              2  0.3333333
10   active    10          3              3  0.5000000
# ... with 970 more rows


In [41]:
# Words containing all five vowels: a row is kept only when every one of the
# conditions passed to filter() is TRUE.
df %>%
  filter(
    str_detect(word, "a"),
    str_detect(word, "e"),
    str_detect(word, "i"),
    str_detect(word, "o"),
    str_detect(word, "u")
  )

word,i,num_vowels,num_consonants,vowel_prop


In [43]:
# Words with at least 4 different vowels: each str_detect() gives TRUE/FALSE
# per word, so adding the five results counts how many distinct vowels occur.
df %>%
  filter(
    str_detect(word, "a") +
      str_detect(word, "e") +
      str_detect(word, "i") +
      str_detect(word, "o") +
      str_detect(word, "u") >= 4
  )

word,i,num_vowels,num_consonants,vowel_prop
absolute,4,4,4,0.5
appropriate,48,5,6,0.4545455
associate,57,5,4,0.5555556
authority,61,4,5,0.4444444
colleague,166,5,4,0.5555556
continue,186,4,4,0.5
encourage,268,5,4,0.5555556
introduce,431,4,5,0.4444444
organize,585,4,4,0.5
previous,644,4,4,0.5
