In [None]:
library(data.table)
library(tidyverse)
library(htm2txt)
library(pdftools)
library(textreadr)
library(magrittr)

In [None]:
# Cap how many rows and columns of a table are shown when it is displayed
options(
  repr.matrix.max.rows = 100,
  repr.matrix.max.cols = 40
)

In [None]:
# Load the debate transcript dataset shipped with textreadr and preview
# its first 20 rows
data("presidential_debates_2012", package = "textreadr")
head(presidential_debates_2012, n = 20)

In [None]:
# Quantifiers `+` and `*` --> keep dialogues where "Medicare" is followed by
# one or more whitespace characters (`\\s+`) and then any run of characters
# (`.*`). Because `.*` may match nothing, it never changes which rows are
# detected; it is there to illustrate the quantifier.
medicare_dialogues <- presidential_debates_2012 %>%
  select(dialogue) %>%
  filter(str_detect(dialogue, "Medicare\\s+.*"))

medicare_dialogues

In [None]:
# `?` quantifier --> the token right before it is optional (zero or one).
# `\\s?` makes the space optional, so this keeps dialogues that mention
# either "healthcare" or "health care".
# Note: `?` binds only to the single preceding token; writing "health?"
# would merely make the final "h" optional (matching "healt" or "health").
health_dialogues <- presidential_debates_2012 %>%
  filter(str_detect(dialogue, "health\\s?care")) %>%
  select(dialogue)

health_dialogues

In [None]:
# `.` metacharacter --> keep dialogues in which "AARP" is immediately
# followed by at least one more character (`.` matches any single character
# other than a line break by default)
the_dialogues <- select(
  filter(presidential_debates_2012, str_detect(dialogue, "AARP.")),
  dialogue
)

the_dialogues

In [None]:
# `^` anchor with alternation --> keep the rows whose `tot` value begins with
# "1" or "2" (each alternative carries its own `^`), then preview the result.
# This filters rows; the `tot` values themselves are returned unchanged.
extracted_decimals <- presidential_debates_2012 %>%
  select(tot) %>%
  filter(str_detect(tot, "^1|^2"))
extracted_decimals %>% head()

In [None]:
# Counted quantifier `{n}` --> find all roles that are exactly 9 characters
# long (`^.{9}$` anchors both ends, so the whole value must be 9 characters)
# and that also contain the letter "a"
filtered_roles <- presidential_debates_2012 %>%
  filter(
    str_detect(role, "^.{9}$"),
    str_detect(role, "a")
  ) %>%
  select(role)
head(filtered_roles)

In [None]:
# Replace every occurrence of the whole word "president" or "governor" with
# "Leader". `\\b` on both sides restricts the match to whole words, and
# `ignore_case = TRUE` makes sure capitalised forms such as "President" and
# "Governor" are replaced as well.
title_pattern <- regex("\\b(?:president|governor)\\b", ignore_case = TRUE)
replaced_dialogue <- presidential_debates_2012 %>%
  mutate(dialogue = str_replace_all(dialogue, title_pattern, "Leader")) %>%
  select(dialogue)
head(replaced_dialogue)

In [None]:
# `\\s` character class --> swap each individual whitespace character in the
# dialogue for an underscore (consecutive whitespace gives consecutive "_")
whitespace_example <- presidential_debates_2012 %>%
  select(dialogue) %>%
  mutate(dialogue = str_replace_all(dialogue, "\\s", "_"))

head(whitespace_example)

In [None]:
# Character set `[...]` --> pull every lower- or upper-case vowel out of each
# dialogue. str_extract_all() returns one character vector per row, so
# `dialogue` becomes a list-column.
vowel_example <- select(
  mutate(
    presidential_debates_2012,
    dialogue = str_extract_all(dialogue, "[aeiouAEIOU]")
  ),
  dialogue
)

head(vowel_example)

In [None]:
# `^` and `$` anchors together --> keep dialogues that begin with "I" and
# finish with "you." (the period is escaped so it is matched literally)
specific_dialogue <- presidential_debates_2012 %>%
  select(dialogue) %>%
  filter(str_detect(dialogue, "^I.*you\\.$"))
specific_dialogue %>% head()

In [None]:
# `^` anchor with str_extract_all() --> for each row, return the text from
# the start of the dialogue when it begins with "ROMNEY"; rows that do not
# start that way get an empty character vector. `dialogue` becomes a
# list-column and no rows are dropped.
# NOTE(review): this only finds anything if the speaker name is part of the
# dialogue text itself -- confirm against the data.
line_start_example <- presidential_debates_2012 %>%
  select(dialogue) %>%
  mutate(dialogue = str_extract_all(dialogue, "^ROMNEY.*"))

head(line_start_example)

In [None]:
# `$` anchor with str_extract_all() --> for each row, return the match that
# ends in a literal period at the end of the dialogue; rows without such an
# ending get an empty character vector. `dialogue` becomes a list-column and
# no rows are dropped.
line_end_example <- select(
  mutate(
    presidential_debates_2012,
    dialogue = str_extract_all(dialogue, ".*\\.$")
  ),
  dialogue
)

head(line_end_example)

In [None]:
# Alternation `|` --> collect every whole-word mention of "Medicare" or
# "insurance" in each dialogue (`\\b` marks the word boundaries). The result
# is a list-column with one character vector of matches per row.
alternation_example <- presidential_debates_2012 %>%
  select(dialogue) %>%
  mutate(
    dialogue = str_extract_all(dialogue, "\\b(Medicare|insurance)\\b")
  )

head(alternation_example)

In [None]:
# Positive lookbehind `(?<=...)` --> extract the word that directly follows
# "dollar" plus one whitespace character. `\\w+` matches any run of word
# characters (letters, digits or underscore), so the result is not limited
# to numbers. The lookbehind text itself is not part of the match.
# The result is a list-column with one character vector of matches per row.
lookbehind_example <- presidential_debates_2012 %>%
  mutate(
    dialogue = str_extract_all(dialogue, "(?<=\\bdollar\\s)\\w+")
  ) %>%
  select(dialogue)

head(lookbehind_example)

In [None]:
# Positive lookahead `(?=...)` --> extract each word that is followed by
# whitespace and then the whole word "plan". The closing `\\b` inside the
# lookahead keeps longer words such as "planet" or "planning" from counting
# as "plan". The lookahead text itself is not part of the match.
# The result is a list-column with one character vector of matches per row.
lookahead_example <- presidential_debates_2012 %>%
  mutate(
    dialogue = str_extract_all(dialogue, "\\b\\w+(?=\\s+plan\\b)")
  ) %>%
  select(dialogue)

head(lookahead_example)