In [3]:
library(tidyverse)
library(tidytext)

In [4]:
#install.packages('tidytext')

In [5]:
# The e-mail corpus is a tree of per-user folders with one text file per
# e-mail. List every file below the corpus root (full paths, recursing into
# sub-folders). `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
emails <- list.files("/data/enron/", full.names = TRUE, recursive = TRUE)
# Keep only paths containing the literal fragment "/inbox"; `fixed = TRUE`
# makes the match a plain substring test rather than a regular expression.
emails <- emails[grep("/inbox", emails, fixed = TRUE)]

In [6]:
# Read one e-mail file and flatten it into a single string.
#
# fileName: path of the text file to read.
# Returns a one-element list holding all lines of the file joined by a single
# space (warnings from readLines(), e.g. about a missing final newline, are
# suppressed).
stripMessage <- function(fileName) {
  message_lines <- readLines(fileName, warn = FALSE)
  flattened <- paste(message_lines, collapse = ' ')
  list(flattened)
}

In [50]:
# Read every listed e-mail and collect the flattened texts in a one-column
# tibble (one row per e-mail). The column is named `V1` explicitly because the
# later pipelines tokenise `V1`; building the tibble directly avoids relying on
# automatic naming of an unnamed matrix. `vapply()` guarantees a character
# vector (even when `emails` is empty), unlike `sapply()`, whose return type
# depends on its input.
df <- tibble(
  V1 = vapply(
    emails,
    function(path) stripMessage(path)[[1]], # unwrap the one-element list
    character(1),
    USE.NAMES = FALSE
  )
)

In [51]:
# Unigram frequencies: tokenise each e-mail, drop stop words and noisy tokens,
# stem what is left, and list the stems that occur at least 100 times, most
# frequent first.
df %>%
  unnest_tokens(word, V1) %>%   # one token per row, in column `word`
  anti_join(stop_words) %>%     # drop stop words; joins on the shared `word` column
  filter(
    !str_detect(word, pattern = "[[:digit:]]"), # removes any words with numeric digits
    !str_detect(word, pattern = "[[:punct:]]"), # removes any remaining punctuations
    !str_detect(word, pattern = "(.)\\1{2,}"),  # removes any words with 3 or more repeated letters
    !str_detect(word, pattern = "\\b(.)\\b")    # removes any remaining single letter words
  ) %>%
  # Stemming step. NOTE(review): unlist() only lines up with the rows if
  # text_tokens() yields exactly one token per input word -- confirm.
  mutate(word = corpus::text_tokens(word, stemmer = "en") %>% unlist()) %>%
  # count() already returns one row per stem with its total in `n`, so no
  # further group_by()/summarize() pass is needed.
  count(word) %>%
  filter(n >= 100) %>%
  arrange(desc(n))

Joining, by = "word"


word,n
<chr>,<int>
cn,935192
enron,564423
recipi,462334
ou,450111
na,449834
content,93160
messag,79046
subject,73920
origin,71246
pst,66963


In [53]:
# Markup / header style terms that are excluded from the bigram analysis.
tech_words <- c(
  "font", "mime", "http", "charset", "src", "href", "type", "text",
  "encoding", "content", "ascii", "arial", "helvetica", "pst", "thyme"
)

In [55]:
# Bigram frequencies: split each e-mail into two-word sequences, keep only the
# pairs in which both words pass the same noise filters, and list the bigrams
# that occur at least 50 times, most frequent first.
#ngram_list <- df %>%
df %>%
  unnest_tokens(bigram, V1, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,                 # remove stopwords from both words in bi-gram
    !word2 %in% stop_words$word,
    !str_detect(word1, pattern = "[[:digit:]]"), # removes any words with numeric digits
    !str_detect(word2, pattern = "[[:digit:]]"),
    !str_detect(word1, pattern = "[[:punct:]]"), # removes any remaining punctuations
    !str_detect(word2, pattern = "[[:punct:]]"),
    !str_detect(word1, pattern = "(.)\\1{2,}"),  # removes any words with 3 or more repeated letters
    !str_detect(word2, pattern = "(.)\\1{2,}"),
    !str_detect(word1, pattern = "\\b(.)\\b"),   # removes any remaining single letter words
    !str_detect(word2, pattern = "\\b(.)\\b"),   # (second check now targets word2, not word1 again)
    word1 != word2,                              # drop pairs that repeat the same word
    nchar(word1) >= 4,                           # both words must have at least 4 characters
    nchar(word2) >= 4,
    !word1 %in% tech_words,                      # drop pairs containing a term listed in tech_words
    !word2 %in% tech_words
  ) %>%
  unite("bigram", c(word1, word2), sep = " ") %>% # re-join the two words into one column
  count(bigram) %>%
  filter(n >= 50) %>%
  arrange(desc(n))
#  pull(bigram)

bigram,n
<chr>,<int>
original message,21762
enron corp,8112
folder exmerge,7114
privileged inbox,6130
elink script,5371
nemec gerald,5097
dasovich jeff,4950
scheduled outages,4704
intended recipient,4675
players league,4443
