# Chapter 10. Processing text
## Notebook for R

### 10.1 Reading and cleaning text

In [1]:
library(glue)
library(magrittr)
# Raw HTML fragment used as the running example. It is deliberately left
# unmodified so that the cells below that read my_wiki_text still receive
# the markup they are meant to strip.
my_wiki_text = "<p><b>Communication</b> (from Latin <i>communicare</i>, meaning to share)"
glue(my_wiki_text)

# Replace each known tag by a blank. The result is stored in a new variable
# rather than assigned back with %<>%, which would overwrite the raw text.
my_wiki_text_2 = my_wiki_text %>%
  gsub("<p>", " ", .) %>%
  gsub("<b>", " ", .) %>%
  gsub("</b>", " ", .) %>%
  gsub("<i>", " ", .) %>%
  gsub("</i>", " ", .)
my_wiki_text_2 = gsub("[[:punct:]]", "", my_wiki_text_2) # Remove punctuation
my_wiki_text_2 = tolower(my_wiki_text_2) # Convert to lower case
# Collapse runs of whitespace into a single blank and trim both ends
my_wiki_text_2 = trimws(gsub("\\s+", " ", my_wiki_text_2))
glue(my_wiki_text_2)

In [2]:
library(stringi)
# Delete every tag in one pass: "<.+?>" lazily matches from a "<" up to the
# nearest ">".
my_wiki_text_3 = stri_replace_all_regex(my_wiki_text, pattern = "<.+?>", replacement = "")
# Drop leading and trailing whitespace
my_wiki_text_3 = stri_trim_both(my_wiki_text_3)
glue(my_wiki_text_3)

In [3]:
# Base R variant: gsub() removes every "<...>" tag using a non-greedy match
my_wiki_text_4 = gsub("<.*?>", "", my_wiki_text)
glue(my_wiki_text_4)

In [4]:
# Two example tweets that differ in spelling (US vs. UK) and contain typos
tweets = "<@born_in_america: My second favorite color is green, I must acknowlege my friends for that!>, <@born_in_britain: My second favourite colour is red, I must aknowledge my friends for that!>"
# Correct both misspellings of "acknowledge"
tweets_unified = gsub(pattern = "acknowlege|aknowledge" , replacement = "acknowledge", x = tweets)
# The US and UK spellings differ only by an optional "u", so match exactly
# that. A wildcard such as "col.+?r" would also rewrite unrelated text, e.g.
# "collar" or "cold water".
tweets_unified = gsub(pattern = "colou?r", replacement = "color", x = tweets_unified)
tweets_unified = gsub(pattern = "favou?rite", replacement = "favorite", x = tweets_unified)
glue(tweets)
glue(tweets_unified)

In [5]:
# Split the tweets on blanks
users = unlist(strsplit(tweets, split = " "))
# Keep only the pieces that contain an @mention
users = users[grepl("(^|[^@\\w])@(\\w{1,15})\\b", users)]
# Strip every character that is not alphanumeric, "@" or "_"
users = gsub("[^[:alnum:]@_]", "", users)
print(users)

[1] "@born_in_america" "@born_in_britain"


In [130]:
library(quanteda)
# Tokenize the raw tweets. The variable shares its name with the tokens()
# function; calls such as tokens(...) still resolve to the function because
# R skips non-function objects when looking up a name in call position.
tokens = tokens(tweets)
print(tokens)
glue(ntoken(tweets))

# Tokenize again without punctuation, then drop the English stopwords
filtered_tokens = tokens(tweets, remove_punct = TRUE)
filtered_tokens = tokens_remove(filtered_tokens, stopwords("english"))
print(filtered_tokens)
glue(ntoken(filtered_tokens))

tokens from 1 document.
text1 :
 [1] "<"                "@born_in_america" ":"                "My"              
 [5] "second"           "favorite"         "color"            "is"              
 [9] "green"            ","                "I"                "must"            
[13] "acknowlege"       "my"               "friends"          "for"             
[17] "that"             "!"                ">"                ","               
[21] "<"                "@born_in_britain" ":"                "My"              
[25] "second"           "favourite"        "colour"           "is"              
[29] "red"              ","                "I"                "must"            
[33] "aknowledge"       "my"               "friends"          "for"             
[37] "that"             "!"                ">"               



tokens from 1 document.
text1 :
 [1] "@born_in_america" "second"           "favorite"         "color"           
 [5] "green"            "must"             "acknowlege"       "friends"         
 [9] "@born_in_britain" "second"           "favourite"        "colour"          
[13] "red"              "must"             "aknowledge"       "friends"         



In [139]:
# Tokenize a short example phrase and reduce every token to its stem
tokens_wordstem(
  tokens("Buildings Builds awesome")
)

tokens from 1 document.
text1 :
[1] "Build"  "Build"  "awesom"


In [222]:
library(spacyr)
# Initialize spaCy with the small English model
spacy_initialize(
  model = "en_core_web_sm",
  python_executable = NULL,
  virtualenv = NULL,
  condaenv = NULL,
  ask = FALSE,
  refresh_settings = FALSE,
  save_profile = FALSE,
  check_env = TRUE,
  entity = TRUE
)
# Parse the tweets and report how many rows (tokens) the result holds
doc = spacy_parse(tweets)
glue("Number of tokens: {nrow(doc)}")

spaCy is already initialized



NULL

“lemmatization may not work properly in model 'en_core_web_sm'”


In [223]:
# Show token, lemma and part-of-speech tag (columns 4 to 6) for rows 8 and 15
print(doc[8, c("token", "lemma", "pos")])
print(doc[15, c("token", "lemma", "pos")])

  token lemma pos
8    is    be AUX
     token  lemma  pos
15 friends friend NOUN


In [224]:
# Extract the noun phrases from the tweets, then report their count and the
# text of the fourth one
noun_chunks = spacy_extract_nounphrases(tweets)
glue("Number of noun chunks: {nrow(noun_chunks)}")
glue("Noun chunk 4: {noun_chunks$text[4]}")

In [242]:
# Parse one sentence, asking spaCy for the dependency relations as well
docp = spacy_parse("My second favorite color is green", dependency = TRUE)
# Pick the two columns with base subsetting: select() belongs to dplyr, and
# no library(dplyr) call is visible in this notebook, so the cell would fail
# with "could not find function" when run from a fresh session.
print(docp[, c("token", "dep_rel")])

“lemmatization may not work properly in model 'en_core_web_sm'”


     token dep_rel
1       My    poss
2   second    amod
3 favorite    amod
4    color   nsubj
5       is    ROOT
6    green   acomp


In [249]:
# Parse a news headline and list every entity found in it
headline = spacy_parse(
  "Madrid will host Olympic Games in 2032, Pedro Sanchez announced"
)
print(entity_extract(headline, type = "all"))

“lemmatization may not work properly in model 'en_core_web_sm'”


  doc_id sentence_id        entity entity_type
1  text1           1        Madrid         GPE
2  text1           1 Olympic_Games         ORG
3  text1           1          2032        DATE
4  text1           1 Pedro_Sanchez      PERSON
