# Chapter 10. Proccesing text
## Notebook for R

### 10.1 Reading and cleaning text

In [1]:
install.packages(c("glue", "tidyverse"))

Installing packages into ‘/home/wva/R/x86_64-pc-linux-gnu-library/4.0’
(as ‘lib’ is unspecified)



In [2]:
library(glue)
library(tidyverse)

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.0     [32m✔[39m [34mdplyr  [39m 1.0.5
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mcollapse()[39m masks [34mglue[39m::collapse()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m   masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m      masks [34mstats[39m::lag()



In [3]:
text = "This is text."
glue("text is a {class(text)} of length {length(text)}")
glue("text[1]: {text[1]}")
glue("str_length(text): {str_length(text)}")
glue("str_sub(text, 6,7): {str_sub(text, 6,7)}")


In [4]:
words = c("These", "are", "words")
glue("words is a {class(words)} of length {length(words)}")
glue("words[1]: {words[1]}")
glue("words[2:3]: {paste(words[2:3], collapse=' ')}")

In [5]:
text = "    <b>Communication</b>    (from Latin communicare, meaning to share)  "
cleaned = text %>% 
  # remove HTML tags:
  str_replace_all("<b>", " ")  %>% 
  str_replace_all("</b>", " ")  %>% 
  # normalize white space 
  str_squish() %>%
  # lower case
  tolower()  %>% 
  # trim spaces at start and end
  trimws()

glue(cleaned)

In [6]:
text = "    <b>Communication</b>    (from Latin communicare, meaning to share)  "
cleaned = text %>% 
  # remove HTML tags:
  str_replace_all("<[^>]+>", " ")  %>% 
  # normalize white space 
  str_replace_all("\w{space}+", " ")  %>% 
  # trim spaces at start and end
  str_remove_all("^\\s+|\\s+$")

cleaned

ERROR: Error: '\w' is an unrecognized escape in character string starting ""\w"


In [7]:
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=nl_NL.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=nl_NL.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=nl_NL.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=nl_NL.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.4.0   stringr_1.4.0   dplyr_1.0.5     purrr_0.3.4    
 [5] readr_1.4.0     tidyr_1.1.3     tibble_3.1.0    ggplot2_3.2.1  
 [9] tidyverse_1.3.0 glue_1.4.2     

loaded via a namespace (and not attached):
 [1] pbdZMQ_0.3-3     tidyselect_1.1.0 repr_1.1.0

In [6]:
library(tidyverse)
tweets = read_csv("http://cssbook.net/d/example_tweets.csv")
tweets = tweets %>% mutate(
    # identify tweets with hashtags
    has_tag=str_detect(text, "#\\w+"),
    # How many at-mentions are there?
    n_at = str_count(text, "(^|\\s)@\\w+"),
    # Extract first url
    url = str_extract(text, "(https?://\\S+)"),
    # Extract only plain text
    plain2 = str_replace_all(text, "(^|\\s)(@|#|https?://)\\S+", " ") %>% 
             str_replace_all("\\W+", " ")
    )

tweets

Parsed with column specification:
cols(
  id = [32mcol_double()[39m,
  text = [31mcol_character()[39m
)



id,text,has_tag,n_at,url,plain2
<dbl>,<chr>,<lgl>,<int>,<chr>,<chr>
1,RT: @john_doe https://example.com/news very interesting!,False,1,https://example.com/news,RT very interesting
2,tweet with just text,False,0,,tweet with just text
3,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
4,@me and @myself #selfietime,True,2,,and


In [7]:
text = "apples, pears, oranges"
items = strsplit(text, ", ", fixed=T)[[1]]
items = str_split(text, "\\p{PUNCTUATION}\\s*")[[1]]
items = str_extract_all(text, "\\p{LETTER}+")[[1]]
print(items)
joined = str_c(items, collapse=" & ")
print(joined)

[1] "apples"  "pears"   "oranges"
[1] "apples & pears & oranges"


In [8]:
tags = tweets %>% mutate(tag=str_extract_all(tweets$text, "(#\\w+)")) %>% select(id, tag)
tags_long = tags  %>% unnest(tag)
left_join(tags_long, tweets)

Joining, by = "id"


id,tag,text,has_tag,n_at,url,plain2
<dbl>,<chr>,<chr>,<lgl>,<int>,<chr>,<chr>
3,#breaking,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
3,#mustread,http://example.com/pandas #breaking #mustread,True,0,http://example.com/pandas,
4,#selfietime,@me and @myself #selfietime,True,2,,and


In [9]:
words = tweets %>% mutate(word=str_split(tweets$text, "\\W+")) %>% select(id, word)
words_long = words %>% unnest(word)
head(words_long)

id,word
<dbl>,<chr>
1,RT
1,john_doe
1,https
1,example
1,com
1,news


In [10]:
words_long %>% group_by(id) %>% summarize(joined=str_c(word, collapse="_"))

id,joined
<dbl>,<chr>
1,RT_john_doe_https_example_com_news_very_interesting_
2,tweet_with_just_text
3,http_example_com_pandas_breaking_mustread
4,_me_and_myself_selfietime
