In [None]:
library(dplyr)
library(tidytext)
library(tokenizers)
library(tidyr)
library(ggplot2)
library(tm)
library(wordcloud)

In [None]:
# Collect the paths of every "ata" text file found in ../atas.
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored at the end of the file name. The previous ".txt" accepted any name
# containing "txt" preceded by an arbitrary character (e.g. "notes_txt.bak").
listAtas <- list.files(
  path = "../atas",
  pattern = "\\.txt$",
  all.files = TRUE,
  full.names = TRUE
)

print(paste(length(listAtas), "atas"))

In [None]:
# Read every ata into one string (its lines joined by a single space), so
# `corpus` holds exactly one element per file listed in `listAtas`.
# vapply() replaces the previous loop that grew the vector with c() on each
# iteration, and it always returns a character vector: with no input files the
# result is character(0) rather than NULL.
corpus <- vapply(
  listAtas,
  function(ata) {
    paste(readLines(con = ata, encoding = "UTF-8"), collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE # keep the vector unnamed, as the loop version produced
)
print(paste(length(corpus), "atas"))

# Corpus Characteristics 

In [None]:
# The whole corpus as one string; documents are separated by a single space.
corpusJoined <- paste(corpus, collapse = " ")

# Text variants derived from the joined corpus.
corpusJoinedWithoutPunctuation <- tm::removePunctuation(corpusJoined)
corpusJoinedWithoutSpaces <- gsub(" ", "", corpusJoined, fixed = TRUE)

# Word tokens of the raw text and of the punctuation-free text.
corpusWordTokenized <- tokenizers::tokenize_words(
  corpusJoined,
  simplify = TRUE
)
corpusWordTokenizedWithoutPunctuation <- tokenizers::tokenize_words(
  corpusJoinedWithoutPunctuation,
  simplify = TRUE
)

# Sentence tokens of the raw text.
corpusSentences <- tokenizers::tokenize_sentences(
  corpusJoined,
  simplify = TRUE
)

In [None]:
# Print the corpus size figures. The counts are computed once inside local()
# so the helper names do not leak into the workspace.
local({
  charsWithSpaces <- nchar(corpusJoined)
  charsWithoutSpaces <- nchar(corpusJoinedWithoutSpaces)
  wordCount <- length(corpusWordTokenizedWithoutPunctuation)
  sentenceCount <- length(corpusSentences)

  print(paste0("Number of characters with spaces: ", charsWithSpaces))
  print(paste0("Number of characters without spaces: ", charsWithoutSpaces))
  print(paste0("Number of words: ", wordCount))
  print(paste0("Number of sentences: ", sentenceCount))
  # Ratios: non-space characters per word token, word tokens per sentence.
  print(paste0("Number of characters per words: ", charsWithoutSpaces / wordCount))
  print(paste0("Number of words per sentence: ", wordCount / sentenceCount))
})

# Frequencies 

In [None]:
# One row per element of `corpus`, kept as plain character strings in the
# `sentence` column; this is the input table for the unnest_tokens() calls.
df <- data.frame(
  sentence = corpus,
  stringsAsFactors = FALSE
)

In [None]:
# Terms excluded from the "without stop words" frequency tables.
Mystopwords <- c(
  # Miscellaneous extra terms
  "ainda", "ante", "p", "r", "sobre",
  # Month names
  "janeiro", "fevereiro", "março", "abril", "maio", "junho",
  "julho", "agosto", "setembro", "outubro", "novembro", "dezembro",
  # Month / year words
  "mês", "meses", "ano", "anos",
  # The single digits "0" to "9"
  as.character(0:9),
  # Stop-word list provided by tm for the "pt" language code
  tm::stopwords("pt")
)


## Character frequency 

In [None]:
# Frequency table of individual characters: split every document into
# character tokens, then count each one, most frequent first.
charFreq <- df %>%
  tidytext::unnest_tokens(
    output = character,
    input = sentence,
    token = "characters"
  ) %>%
  dplyr::count(character, sort = TRUE) %>%
  dplyr::ungroup()

In [None]:
# How many of the most frequent characters to display.
number_of_chars <- 20
# head() returns at most `number_of_chars` rows. Indexing with
# 1:number_of_chars would append all-NA rows whenever the table has fewer
# rows than requested.
head(charFreq, number_of_chars)

In [None]:
# Horizontal bar chart of the `number_of_chars` most frequent characters,
# ordered by count.
# - head() avoids the all-NA rows that 1:number_of_chars indexing produces when
#   the table is shorter than requested.
# - theme_bw() is a complete theme and replaces any theme() settings added
#   before it, so it now comes first and the axis-text tweak is applied on top.
ggplot(head(charFreq, number_of_chars), aes(x = reorder(character, n), y = n)) +
  geom_col() +
  ylab("Frequency") + xlab("characters") + ggtitle("Characters frequencies") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

## Word Frequency

In [None]:
# Number of top-ranked words shown in the word frequency tables and bar charts.
numberOfWords <- 20

### With stop words

In [None]:
# Frequency table of single words (stop words included): tokenize every
# document into words, then count each word, most frequent first.
wordsFreq <- df %>%
  tidytext::unnest_tokens(output = word, input = sentence) %>%
  dplyr::count(word, sort = TRUE) %>%
  dplyr::ungroup()

In [None]:
# Show the `numberOfWords` most frequent words. head() returns at most that
# many rows, whereas 1:numberOfWords indexing pads a shorter table with
# all-NA rows.
head(wordsFreq, numberOfWords)

In [None]:
# Horizontal bar chart of the `numberOfWords` most frequent words (stop words
# included), ordered by count.
# - head() avoids the all-NA rows that 1:numberOfWords indexing produces when
#   the table is shorter than requested.
# - theme_bw() is a complete theme and replaces any theme() settings added
#   before it, so it now comes first and the axis-text tweak is applied on top.
ggplot(head(wordsFreq, numberOfWords), aes(x = reorder(word, n), y = n)) +
  geom_col() +
  ylab("Frequency") + xlab("words") + ggtitle("Word frequencies with stop words") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

### Wordcloud with stop words

In [None]:
# Word cloud built from the full word frequency table (stop words included).
# Settings: frequency threshold 1000, at most 1000 words, no rotated words,
# non-random plotting order, six colours from the "Dark2" palette.
wordcloud(
  words = wordsFreq$word,
  freq = wordsFreq$n,
  scale = c(5, 1.2),
  min.freq = 1000,
  max.words = 1000,
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(6, "Dark2")
)

### Without stop words

In [None]:
# Word frequency table with every entry of `Mystopwords` removed.
wordsFreq2 <- dplyr::filter(
  wordsFreq,
  !(word %in% Mystopwords)
)

In [None]:
# Show the `numberOfWords` most frequent non-stop words. head() returns at
# most that many rows, whereas 1:numberOfWords indexing pads a shorter table
# with all-NA rows.
head(wordsFreq2, numberOfWords)

In [None]:
# Horizontal bar chart of the `numberOfWords` most frequent words after stop
# word removal, ordered by count.
# - head() avoids the all-NA rows that 1:numberOfWords indexing produces when
#   the table is shorter than requested.
# - theme_bw() is a complete theme and replaces any theme() settings added
#   before it, so it now comes first and the axis-text tweak is applied on top.
ggplot(head(wordsFreq2, numberOfWords), aes(x = reorder(word, n), y = n)) +
  geom_col() +
  ylab("Frequency") + xlab("words") + ggtitle("Word frequencies without stop words") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

### Wordcloud without stop words

In [None]:
# Word cloud built from the stop-word-free frequency table.
# Settings: frequency threshold 1000, at most 1000 words, no rotated words,
# non-random plotting order, six colours from the "Dark2" palette.
wordcloud(
  words = wordsFreq2$word,
  freq = wordsFreq2$n,
  scale = c(4, 0.8),
  min.freq = 1000,
  max.words = 1000,
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(6, "Dark2")
)

## Bigram Frequency
 

In [None]:
# Number of top-ranked bigrams shown in the bigram frequency tables and bar charts.
numberOfBigram <- 20

### With stop words

In [None]:
# Frequency table of bigrams (stop words included): tokenize every document
# into 2-word n-grams, then count each one, most frequent first.
bigramFreq <- df %>%
  tidytext::unnest_tokens(
    output = word,
    input = sentence,
    token = "ngrams",
    n = 2
  ) %>%
  dplyr::count(word, sort = TRUE) %>%
  dplyr::ungroup()

In [None]:
# Show the `numberOfBigram` most frequent bigrams. head() returns at most that
# many rows, whereas 1:numberOfBigram indexing pads a shorter table with
# all-NA rows.
head(bigramFreq, numberOfBigram)

In [None]:
# Horizontal bar chart of the `numberOfBigram` most frequent bigrams (stop
# words included), ordered by count.
# - head() avoids the all-NA rows that 1:numberOfBigram indexing produces when
#   the table is shorter than requested.
# - theme_bw() is a complete theme and replaces any theme() settings added
#   before it, so it now comes first and the axis-text tweak is applied on top.
ggplot(head(bigramFreq, numberOfBigram), aes(x = reorder(word, n), y = n)) +
  geom_col() +
  ylab("Frequency") + xlab("bigrams") + ggtitle("Bigram frequencies with stop words") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

### Bigram cloud with stop words

In [None]:
# Cloud of bigrams built from the full bigram frequency table.
# Settings: frequency threshold 1000, at most 1000 entries, no rotated text,
# non-random plotting order, six colours from the "Dark2" palette.
wordcloud(
  words = bigramFreq$word,
  freq = bigramFreq$n,
  scale = c(3.8, 0.6),
  min.freq = 1000,
  max.words = 1000,
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(6, "Dark2")
)

### Without stop words

In [None]:
# Bigram frequency table without stop words: split each bigram into its two
# words, keep only rows where neither word is in `Mystopwords`, then join the
# two words back into a single space-separated `word` column.
bigramFreq2 <- bigramFreq %>%
  dplyr::select(word, n) %>%
  tidyr::separate(col = word, into = c("word1", "word2"), sep = " ") %>%
  dplyr::filter(!(word1 %in% Mystopwords | word2 %in% Mystopwords)) %>%
  dplyr::select(word1, word2, n) %>%
  tidyr::unite(col = word, word1, word2, sep = " ")

In [None]:
# Show the `numberOfBigram` most frequent stop-word-free bigrams. head()
# returns at most that many rows, whereas 1:numberOfBigram indexing pads a
# shorter table with all-NA rows.
head(bigramFreq2, numberOfBigram)

In [None]:
# Horizontal bar chart of the `numberOfBigram` most frequent bigrams after
# stop word removal, ordered by count.
# - head() avoids the all-NA rows that 1:numberOfBigram indexing produces when
#   the table is shorter than requested.
# - theme_bw() is a complete theme and replaces any theme() settings added
#   before it, so it now comes first and the axis-text tweak is applied on top.
ggplot(head(bigramFreq2, numberOfBigram), aes(x = reorder(word, n), y = n)) +
  geom_col() +
  ylab("Frequency") + xlab("bigrams") + ggtitle("Bigram frequencies without stop words") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

### Bigram cloud without stop words

In [None]:
# Cloud of bigrams built from the stop-word-free bigram table.
# Settings: frequency threshold 200, at most 500 entries, no rotated text,
# non-random plotting order, six colours from the "Dark2" palette.
wordcloud(
  words = bigramFreq2$word,
  freq = bigramFreq2$n,
  scale = c(3, 0.5),
  min.freq = 200,
  max.words = 500,
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(6, "Dark2")
)

## Trigram Frequency

In [None]:
# Number of top-ranked trigrams shown in the trigram frequency tables and bar charts.
numberOfTrigram <- 20

### With stop words

In [None]:
# Frequency table of trigrams (stop words included): tokenize every document
# into 3-word n-grams, then count each one, most frequent first.
trigramFreq <- df %>%
  tidytext::unnest_tokens(
    output = word,
    input = sentence,
    token = "ngrams",
    n = 3
  ) %>%
  dplyr::count(word, sort = TRUE) %>%
  dplyr::ungroup()

In [None]:
# Show the `numberOfTrigram` most frequent trigrams. head() returns at most
# that many rows, whereas 1:numberOfTrigram indexing pads a shorter table with
# all-NA rows.
head(trigramFreq, numberOfTrigram)

In [None]:
# Horizontal bar chart of the `numberOfTrigram` most frequent trigrams (stop
# words included), ordered by count.
# - head() avoids the all-NA rows that 1:numberOfTrigram indexing produces
#   when the table is shorter than requested.
# - theme_bw() is a complete theme and replaces any theme() settings added
#   before it, so it now comes first and the axis-text tweak is applied on top.
ggplot(head(trigramFreq, numberOfTrigram), aes(x = reorder(word, n), y = n)) +
  geom_col() +
  ylab("Frequency") + xlab("trigrams") + ggtitle("Trigram frequencies with stop words") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

### Trigram cloud with stop words

In [None]:
# Cloud of trigrams built from the full trigram frequency table.
# Settings: frequency threshold 100, at most 250 entries, no rotated text,
# non-random plotting order, six colours from the "Dark2" palette.
wordcloud(
  words = trigramFreq$word,
  freq = trigramFreq$n,
  scale = c(2.5, 0.3),
  min.freq = 100,
  max.words = 250,
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(6, "Dark2")
)

### Without stop words

In [None]:
# Trigram frequency table without stop words: split each trigram into its
# three words, keep only rows where none of them is in `Mystopwords`, then
# join the words back into a single space-separated `word` column.
trigramFreq2 <- trigramFreq %>%
  dplyr::select(word, n) %>%
  tidyr::separate(
    col = word,
    into = c("word1", "word2", "word3"),
    sep = " "
  ) %>%
  dplyr::filter(
    !(word1 %in% Mystopwords |
      word2 %in% Mystopwords |
      word3 %in% Mystopwords)
  ) %>%
  dplyr::select(word1, word2, word3, n) %>%
  tidyr::unite(col = word, word1, word2, word3, sep = " ")

In [None]:
# Show the `numberOfTrigram` most frequent stop-word-free trigrams. head()
# returns at most that many rows, whereas 1:numberOfTrigram indexing pads a
# shorter table with all-NA rows.
head(trigramFreq2, numberOfTrigram)

In [None]:
# Horizontal bar chart of the `numberOfTrigram` most frequent trigrams after
# stop word removal, ordered by count.
# - head() avoids the all-NA rows that 1:numberOfTrigram indexing produces
#   when the table is shorter than requested.
# - theme_bw() is a complete theme and replaces any theme() settings added
#   before it, so it now comes first and the axis-text tweak is applied on top.
ggplot(head(trigramFreq2, numberOfTrigram), aes(x = reorder(word, n), y = n)) +
  geom_col() +
  ylab("Frequency") + xlab("trigrams") + ggtitle("Trigram frequencies without stop words") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()

In [None]:
# Cloud of trigrams built from the stop-word-free trigram table.
# Settings: frequency threshold 50, at most 200 entries, no rotated text,
# non-random plotting order, six colours from the "Dark2" palette.
wordcloud(
  words = trigramFreq2$word,
  freq = trigramFreq2$n,
  scale = c(2.3, 0.3),
  min.freq = 50,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0,
  colors = brewer.pal(6, "Dark2")
)