# Introduction to NLP with Quanteda

In [None]:
library('quanteda')

In [None]:
# Sample passage used by the tokenization examples that follow. The string
# starts and ends with a newline and contains two em dashes (U+2014) that join
# neighbouring words without any surrounding whitespace.
text <- '
Call me Ishmael. Some years ago—never mind how long precisely—having little
or no money in my purse, and nothing particular to interest me on shore, 
I thought I would sail about a little and see the watery part of the world.
'

In [None]:
# print() shows the raw string, with the embedded newlines escaped as \n.
print(text)

## Tokenization

In [None]:
# Tokenize the passage with tokens()'s default settings and display the result.
tokens(text)

In [None]:
# Keep the tokens object and show the tokens of its first (and only) document.
word_toks <- tokens(text)
word_toks[[1]]

In [None]:
# Tokenize into sentences instead (what = 'sentence').
sent_toks <- tokens(text, what = 'sentence')

In [None]:
# Display the sentence-level tokens.
sent_toks

In [None]:
# Show the word-level tokens again for comparison with the sentences above.
word_toks[[1]]

In [None]:
# Re-tokenize every sentence of the first document into words and collect all
# of them in one flat list (one list element per token).
#
# lapply() + unlist() builds the result in a single pass instead of growing
# the list with append() inside nested loops, which copies the list on every
# iteration. as.list() keeps `words` a list, exactly as the loop produced.
words <- as.list(unlist(
  lapply(sent_toks[[1]], function(sentence) tokens(sentence)[[1]]),
  use.names = FALSE
))

words

In [None]:
# Sentence-tokenize two short documents at once.
sent_toks <- tokens(
  c(
    "Call me Ishmael.  That's my name.",
    "Call me Ben.  That used to be my name."
  ),
  what = "sentence"
)

In [None]:
# One entry per input document.
sent_toks

In [None]:
# NOTE(review): `words` is initialised here but never filled by the loop
# below, so the final line displays an empty list -- confirm this is intended.
words <- list()

# Print every sentence of every document. seq_along() is used instead of
# 1:length(x) because 1:0 evaluates to c(1, 0), which would index past the end
# of an empty tokens object or an empty document.
for (doc_idx in seq_along(sent_toks)) {
  for (sent_idx in seq_along(sent_toks[[doc_idx]])) {
    print(sent_toks[[doc_idx]][[sent_idx]])
  }
}

words

## Stopword removal

In [None]:
# Display the English stopword list.
stopwords("english")

In [None]:
# The word tokens of the passage, before any stopword filtering.
word_toks[[1]]

In [None]:
# NOTE(review): `words` is initialised here but never filled by the loop
# below, so the final line displays an empty list -- confirm this is intended.
words <- list()

# Look the stopword list up once instead of on every loop iteration.
english_stopwords <- stopwords("english")

# Print each token that is not an exact match for a stopword. %in% compares
# strings exactly, so the comparison is case-sensitive.
for (tok in word_toks[[1]]) {
  if (!(tok %in% english_stopwords)) {
    print(tok)
  }
}

words

In [None]:
# Step 1: drop English stopwords from the word tokens.
word_toks_v2 <- tokens(text) %>%
  tokens_remove(stopwords("english"))
word_toks_v2[[1]]

In [None]:
# Step 2: additionally strip punctuation while tokenizing.
word_toks_v2 <- tokens(text, remove_punct = TRUE) %>%
  tokens_remove(stopwords("english"))
word_toks_v2[[1]]

In [None]:
# Step 3: additionally remove tokens matching the regular expression "p+".
word_toks_v2 <- tokens(text, remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_select("p+", selection = "remove", valuetype = "regex")
word_toks_v2[[1]]

In [None]:
# Step 4: of what is left, keep only the tokens that contain an em dash
# (U+2014).
word_toks_v2 <- tokens(text, remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_select("p+", selection = "remove", valuetype = "regex") %>%
  tokens_select(".*\u2014.*", selection = "keep", valuetype = "regex")
word_toks_v2[[1]]

In [None]:
# Baseline for the next cell: word tokens with punctuation removed.
word_toks_v3 <- tokens(text, remove_punct = TRUE)
word_toks_v3[[1]]

In [None]:
# Same tokens, but split wherever a token contains an em dash (U+2014).
word_toks_v3 <- tokens_split(
  tokens(text, remove_punct = TRUE),
  separator = "\u2014"
)
word_toks_v3[[1]]

In [None]:
# Keyword-in-context for the word "long".
kwic(word_toks_v3, pattern = "long")

In [None]:
# Regex pattern: tokens that begin with one or more "s" characters.
kwic(word_toks_v3, pattern = "^[s]+", valuetype = "regex")

In [None]:
# Same regex, but with case-insensitive matching switched off.
kwic(
  word_toks_v3,
  pattern = "^[s]+",
  valuetype = "regex",
  case_insensitive = FALSE
)

In [None]:
# Glob pattern: "s" followed by anything.
kwic(word_toks_v3, pattern = "s*", valuetype = "glob")

## N-grams

In [None]:
# Bigrams of the first document.
twograms <- tokens_ngrams(word_toks_v3, n = 2)
twograms[[1]]

In [None]:
# Trigrams of the first document.
threegrams <- tokens_ngrams(word_toks_v3, n = 3)
threegrams[[1]]

In [None]:
# Rebuild the tokens without punctuation and English stopwords, then form
# trigrams from what remains.
word_toks_v3 <- tokens_remove(
  tokens(text, remove_punct = TRUE),
  stopwords("english")
)
threegrams <- tokens_ngrams(word_toks_v3, n = 3)
threegrams[[1]]

In [None]:
# Display the English stopword list again before customising it.
stopwords("english")

In [None]:
# Drop the words listed in `keepers` from the stopword list. %in% compares
# strings exactly (case-sensitively), so the upper-case 'I' here only removes
# an entry spelled exactly 'I'.
s <- stopwords("english")
keepers <- c('me','myself','I')
s[!(s %in% keepers)]

In [None]:
# Same filter, this time with a lower-case 'i' in `keepers`.
s <- stopwords("english")
keepers <- c('me','myself','i')
s[!(s %in% keepers)]

In [None]:
# Stopword list minus the pronouns that should stay in the text.
keepers <- c("me", "myself", "i")
s <- stopwords("english")
s <- s[!(s %in% keepers)]

# Tokenize without punctuation, then remove the customised stopwords.
word_toks_v3 <- tokens_remove(tokens(text, remove_punct = TRUE), s)

# Trigrams of the first document.
threegrams <- tokens_ngrams(word_toks_v3, n = 3)
threegrams[[1]]

In [None]:
# Extend the stopword list with extra words to drop (`losers`), then take out
# the words that should be kept (`keepers`).
losers <- "ishmael"
keepers <- c("me", "myself", "i")
s <- c(stopwords("english"), losers)
s <- s[!(s %in% keepers)]
print(s)

# Tokenize without punctuation, then remove the customised stopwords.
word_toks_v3 <- tokens_remove(tokens(text, remove_punct = TRUE), s)

# Trigrams of the first document.
threegrams <- tokens_ngrams(word_toks_v3, n = 3)
threegrams[[1]]

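Wait, what? We added the lower-case stopword `ishmael`, yet the capitalized token `Ishmael` was removed as well.
* `tokens_remove()` has `case_insensitive = TRUE` by default, so stopword patterns match tokens regardless of case.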

In [None]:
# Customised stopword list: English stopwords plus `losers`, minus `keepers`.
losers <- "ishmael"
keepers <- c("me", "myself", "i")
s <- c(stopwords("english"), losers)
s <- s[!(s %in% keepers)]

# case_insensitive is passed explicitly here; swap in the commented-out line
# to compare the result with case-sensitive matching.
word_toks_v3 <- tokens(text, remove_punct = TRUE) %>%
  tokens_remove(s, case_insensitive = TRUE)
  # tokens_remove(s, case_insensitive = FALSE)

# Trigrams of the first document.
threegrams <- tokens_ngrams(word_toks_v3, n = 3)
threegrams[[1]]

## Stemming and Tagging

In [None]:
# A sentence containing several inflected forms of "sail".
text2 <- "Ishmael sailed because sailing and wanting to sail was in his blood."

In [None]:
# Plain word tokens.
tokens(text2)

In [None]:
# Stem every token.
tokens_wordstem(tokens(text2))

In [None]:
# Remove punctuation and English stopwords first, then stem what is left.
tokens(text2, remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_wordstem()

Part-of-speech

We'll turn to spaCy (through the `spacyr` R package) and its linguistic capabilities
* https://spacy.io/usage/linguistic-features

In [None]:
# Load spacyr and report which version of the package is installed.
library("spacyr")
packageVersion("spacyr")

https://www.rdocumentation.org/packages/spacyr/versions/1.2.1

In [None]:
# Start the spaCy backend before any spacy_*() call is made.
# this uses "en_core_web_sm" by default
spacy_initialize()

In [None]:
# The example sentence defined in the stemming section.
text2

In [None]:
# Parse the sentence, requesting both the `tag` and `pos` annotations.
txtparsed <- spacy_parse(text2, tag = TRUE, pos = TRUE)

In [None]:
# Display the parse result.
txtparsed

In [None]:
# Three named documents to parse in one call.
# NOTE(review): "\\n" and "\\\"" are literal backslash sequences in R (a
# backslash followed by n, and a backslash followed by a quote), not a newline
# or a bare quote -- confirm this is what the example intends.
txt2 <- c(
  doc1 = "The fast cat catches mice.\\nThe quick brown dog jumped.",
  doc2 = "This is the second document.",
  doc3 = "This is a \\\"quoted\\\" text."
)
spacy_parse(txt2, entity = TRUE, dependency = TRUE)

In [None]:
# Parse one sentence twice: first with entity and noun-phrase output, then
# with two additional token attributes.
txt3 <- "We analyzed the Supreme Court with three natural language processing tools."
spacy_parse(txt3, entity = TRUE, nounphrase = TRUE)
spacy_parse(txt3, additional_attributes = c("like_num", "is_punct"))

In [None]:
# Two short sentences used for the spacyr examples below.
s <- "Once upon a time there was a cat.  It was black and fluffy."

In [None]:
# Tokenize with spaCy.
spacy_tokenize(s)

In [None]:
# Full parse of the same text.
spacy_parse(s)

In [None]:
# Tokenize again, dropping punctuation and separators.
s_reduced <- spacy_tokenize(s, remove_punct = TRUE, remove_separators = TRUE)

In [None]:
# NOTE(review): `s_reduced` is the output of spacy_tokenize(), not raw text;
# presumably this call is here to show that spacy_parse() does not accept it
# -- confirm.
spacy_parse(s_reduced)

In [None]:
# Recreate the reduced tokens for the cells that follow.
s_reduced <- spacy_tokenize(s, remove_punct = TRUE, remove_separators = TRUE)

... getting a little ahead of ourselves, but....

In [None]:
# Try to build a document-feature matrix straight from the spacyr tokens.
# NOTE(review): `s_reduced` comes from spacy_tokenize(), not quanteda's
# tokens(); presumably this cell is expected to fail and motivates the remark
# that follows -- confirm.
cat_dfm <- dfm(s_reduced)

Quanteda objects need to be reorganized to play nicely as inputs to functions from other NLP libraries, and vice versa

In [None]:
# Build the reduced tokens with quanteda's own tokenizer this time.
s_reduced <- tokens(s, remove_punct = TRUE, remove_separators = TRUE)

In [None]:
# Display the quanteda tokens object.
s_reduced

In [None]:
# A quanteda tokens object can be passed to dfm().
cat_dfm <- dfm(s_reduced)

In [None]:
# Display the document-feature matrix.
cat_dfm

In [None]:
# Three short documents about cats (and one dog).
catnews <- c(
  "Once upon a time there was a cat.  It was black and fluffy.",
  "Ben has two cats named Archer and Lana, plus one blond Chihuahua Pearl.",
  "Archer is the real head of the household, as both Pearl and Lana can attest."
)

In [None]:
# Display the parse of all three documents.
spacy_parse(catnews)

In [None]:
# Keep the parse for the filtering below.
txtparsed <- spacy_parse(catnews)

In [None]:
# Tokens whose part of speech is NOUN. which() skips rows where the
# comparison is NA; the outer parentheses print the assigned value.
(nouns <- txtparsed[["token"]][which(txtparsed[["pos"]] == "NOUN")])

In [None]:
# Tokens whose part of speech is PROPN (proper noun).
(propernouns <- txtparsed[["token"]][which(txtparsed[["pos"]] == "PROPN")])

In [None]:
# note: a spacyr parse cannot be passed to dfm(); this would give an error:
# cat_dfm <- dfm(txtparsed)
# so tokenize the raw text with quanteda and build the dfm from that instead:
(cat_dfm <- dfm(tokens(catnews)))

In [None]:
# Same idea, but without punctuation and with English stopwords removed.
cattoks <- tokens(catnews, remove_punct = TRUE)
catdfm <- dfm_remove(dfm(cattoks), stopwords("en"))

In [None]:
# Display the reduced document-feature matrix.
catdfm

In [None]:
# Restrict the dfm to the features matching the proper nouns found by spaCy.
dfm_select(catdfm, pattern = propernouns)

In [None]:
library("quanteda.textplots")

In [None]:
# Draw a word cloud of the cat documents.
# NOTE(review): textplot_wordcloud() only plots features that reach its
# min_count threshold (believed to default to 3), which few if any words in
# this tiny corpus do -- confirm that a near-empty result is the point here.
textplot_wordcloud(catdfm)

In [None]:
# :(
# Redefine the documents with several words repeated ("black", "fluffy",
# "oh so") -- presumably so that enough terms occur often enough to show up in
# the word cloud drawn below.
catnews <- c('Once upon a time there was a cat.  
             It was black, oh so black, and fluffy, oh so fluffy.',
             'Ben has two cats (neither black) named Archer and Lana, 
             plus one (not black but blond) Chihuahua Pearl.',
             'Archer is the real head of the household, even though oh so (OH SO) fluffy,
             as both Pearl and Lana can attest.')

In [None]:
# Rebuild the tokens and the stopword-free dfm from the current documents.
cattoks <- tokens(catnews, remove_punct = TRUE)
catdfm <- dfm_remove(dfm(cattoks), stopwords("en"))

In [None]:
# Word cloud of the rebuilt dfm.
textplot_wordcloud(catdfm)

In [None]:
# Inaugural addresses whose Year is 1826 or earlier.
inaugural <- corpus_subset(data_corpus_inaugural, Year <= 1826)

In [None]:
# Document-feature matrix without punctuation and English stopwords, trimmed
# with a minimum term frequency of 10.
dfm_inaug <- tokens(inaugural, remove_punct = TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) %>%
  dfm_trim(min_termfreq = 10)

In [None]:
# Word cloud with default settings.
textplot_wordcloud(dfm_inaug)

In [None]:
# Fix the random seed before drawing the cloud again.
set.seed(100)
textplot_wordcloud(dfm_inaug)

In [None]:
# Same cloud with a custom colour palette.
textplot_wordcloud(
  dfm_inaug,
  # min_count = 10,
  color = c("red", "pink", "green", "purple", "orange", "blue")
)

In [None]:
# Tokenize the corpus once and reuse the result for every kwic() call in the
# x-ray cells below, instead of re-tokenizing it for each keyword.
inaug_toks <- tokens(inaugural)

# X-ray plot of the keyword "american" across the addresses.
kwic(inaug_toks, pattern = "american") %>%
  textplot_xray()

In [None]:
# Compare three keywords in one plot.
textplot_xray(
  kwic(inaug_toks, pattern = "american"),
  kwic(inaug_toks, pattern = "government"),
  kwic(inaug_toks, pattern = "war")
)

In [None]:
# Same comparison with scale = "absolute".
textplot_xray(
  kwic(inaug_toks, pattern = "american"),
  kwic(inaug_toks, pattern = "government"),
  kwic(inaug_toks, pattern = "war"),
  scale = "absolute"
)

In [None]:
library("quanteda.textstats")

In [None]:
# Display the inaugural dfm.
dfm_inaug

In [None]:
# Frequency statistics for the dfm's features, limited with n = 25.
features_dfm_inaug <- textstat_frequency(dfm_inaug, n = 25)

In [None]:
# First rows of the frequency table.
head(features_dfm_inaug)

In [None]:
# Reorder the `feature` factor levels by descending frequency, which controls
# the order in which the features are laid out when plotted.
features_dfm_inaug$feature <- reorder(
  features_dfm_inaug$feature,
  -features_dfm_inaug$frequency
)

In [None]:
# First rows again, after the reordering.
head(features_dfm_inaug)

In [None]:
library(ggplot2)

In [None]:
# Dot plot of frequency per feature; the x-axis labels are rotated 90 degrees.
ggplot(
  data = features_dfm_inaug,
  mapping = aes(x = feature, y = frequency)
) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

In [None]:
# Show the document-level variables (e.g. Year, President) through the public
# docvars() accessor instead of reading the object's internal @docvars slot,
# whose layout is an implementation detail of quanteda.
docvars(dfm_inaug)

In [None]:
# Frequency statistics grouped by the President document variable, with n = 10.
freq_weight <- textstat_frequency(
  dfm_inaug,
  n = 10,
  groups = dfm_inaug$President
)

In [None]:
# One facet per group. Each row of freq_weight gets its own x position
# (counting down from nrow), and those positions are labelled with the
# feature names; coord_flip() then swaps the two axes.
ggplot(
  data = freq_weight,
  mapping = aes(x = nrow(freq_weight):1, y = frequency)
) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_continuous(
    breaks = nrow(freq_weight):1,
    labels = freq_weight$feature
  )
