## Task 4 - Prediction Model

The goal of this exercise is to build and evaluate your first predictive model. You will use the n-gram and backoff models you built in previous tasks to build and evaluate your predictive model. The goal is to make the model efficient and accurate.

**Tasks to accomplish**

- Build a predictive model based on the previous data modeling steps - you may combine the models in any way you think is appropriate.
- Evaluate the model for efficiency and accuracy - use timing software to evaluate the computational complexity of your model. Evaluate the model accuracy using different metrics like perplexity, accuracy at the first word, second word, and third word.

**Questions to consider**
- How does the model perform for different choices of the parameters and size of the model?
- How much does the model slow down for the performance you gain?
- Does perplexity correlate with the other measures of accuracy?
- Can you reduce the size of the model (number of parameters) without reducing performance?

### 0. Environment Settings

In [None]:
library(data.tree)
library(DiagrammeR)
library(dplyr)
library(ggplot2)
library(igraph)
library(influenceR)
library(plyr)
library(RColorBrewer)
library(SnowballC)
library(stopwords)
library(stringi)
library(stringr)
library(tidyr)
library(tidytext)
library(tidyverse)
library(tokenizers)
library(tm)
library(wordcloud)

In [None]:
# Record the R version, platform and attached package versions for this run.
sessionInfo()

### 1. Data Pre-processing

In [None]:
# Locations of the English blogs, news and twitter corpora
blogs_path <- "~/Soft/Rtest/JHU_capstone_project_data/final/en_US/en_US.blogs.txt"
news_path <- "~/Soft/Rtest/JHU_capstone_project_data/final/en_US/en_US.news.txt"
twitter_path <- "~/Soft/Rtest/JHU_capstone_project_data/final/en_US/en_US.twitter.txt"

# Read every file line by line (UTF-8, embedded NULs skipped) and wrap the
# lines in a one-column data frame named `text`
blogs <- data.frame(
    text = readLines(blogs_path, encoding = "UTF-8", skipNul = TRUE)
)
news <- data.frame(
    text = readLines(news_path, encoding = "UTF-8", skipNul = TRUE)
)
twitter <- data.frame(
    text = readLines(twitter_path, encoding = "UTF-8", skipNul = TRUE)
)

In [None]:
# Draw a reproducible 10% random sample of the lines of each source.
# The three draws stay in the order blogs, news, twitter so that the random
# number stream is consumed in the same sequence.
set.seed(1565)
sample_pct <- 0.1

blogs_sample <- sample_n(blogs, nrow(blogs) * sample_pct)
news_sample <- sample_n(news, nrow(news) * sample_pct)
twitter_sample <- sample_n(twitter, nrow(twitter) * sample_pct)

# Stack the three samples, tagging each row with the corpus it came from
agg_sample <- bind_rows(
    mutate(blogs_sample, source = 'blogs'),
    mutate(news_sample, source = 'news'),
    mutate(twitter_sample, source = 'twitter')
)

# Store the corpus tag as a factor instead of a character column
agg_sample$source <- factor(agg_sample$source)

In [None]:
# The full corpora, their paths and the per-source samples are no longer
# needed once agg_sample exists; drop them to free memory.
rm(list = c(
    "blogs", "blogs_path", "blogs_sample",
    "news", "news_path", "news_sample",
    "twitter", "twitter_path", "twitter_sample"
))

In [None]:
# Patterns removed from the text, applied in this order:
#   replace_reg - every character that is neither a letter nor whitespace
#   replace_url - tokens starting with "http", up to the next whitespace
#   replace_aaa - whole words that contain the same character twice in a row
# NOTE(review): replace_aaa also matches ordinary words with a doubled letter
# (e.g. "good", "will") - confirm that dropping them is intended.
replace_reg <- "[^[:alpha:][:space:]]*"
replace_url <- "http[^[:space:]]*"
replace_aaa <- "\\b(?=\\w*(\\w)\\1)\\w+\\b"

# Apply the three filters one after another to the text column
clean_sample <- agg_sample
clean_sample$text <- str_replace_all(clean_sample$text, replace_reg, "")
clean_sample$text <- str_replace_all(clean_sample$text, replace_url, "")
clean_sample$text <- str_replace_all(clean_sample$text, replace_aaa, "")

# The uncleaned sample is not used again
rm(agg_sample)

### 2. Build N-grams

In [None]:
# Tokenise the cleaned text into n-grams of size 2 to 6.
# Each result has one row per n-gram occurrence, stored in a column named
# after the n-gram size.

# 2-gram
bigram_data <- unnest_tokens(
    as.data.frame(clean_sample),
    output = bigram, input = text, token = "ngrams", n = 2
)

# 3-gram
trigram_data <- unnest_tokens(
    as.data.frame(clean_sample),
    output = trigram, input = text, token = "ngrams", n = 3
)

# 4-gram
quadgram_data <- unnest_tokens(
    as.data.frame(clean_sample),
    output = quadgram, input = text, token = "ngrams", n = 4
)

# 5-gram
quintgram_data <- unnest_tokens(
    as.data.frame(clean_sample),
    output = quintgram, input = text, token = "ngrams", n = 5
)

# 6-gram
sextgram_data <- unnest_tokens(
    as.data.frame(clean_sample),
    output = sextgram, input = text, token = "ngrams", n = 6
)

Reduce the n-gram tables for performance: keep only n-grams that occur more than 10 times, sorted by descending frequency.

In [None]:
# Count how often each n-gram occurs, keep only those seen more than 10 times
# and sort them from most to least frequent.
#
# plyr::count() is called by its full name on purpose: dplyr also exports a
# count() with a different interface and a different output column (`n`
# instead of `freq`), so a bare count() only works when plyr happens to be
# attached after dplyr. The `freq` column is used by every later step.

# Bigram (2-gram)
bigram_cover <- bigram_data %>%
    plyr::count("bigram") %>%
    filter(freq > 10) %>%
    arrange(desc(freq))

# Trigram (3-gram)
trigram_cover <- trigram_data %>%
    plyr::count("trigram") %>%
    filter(freq > 10) %>%
    arrange(desc(freq))

# Quadgram (4-gram)
quadgram_cover <- quadgram_data %>%
    plyr::count("quadgram") %>%
    filter(freq > 10) %>%
    arrange(desc(freq))

# Quintgram (5-gram)
quintgram_cover <- quintgram_data %>%
    plyr::count("quintgram") %>%
    filter(freq > 10) %>%
    arrange(desc(freq))

# Sextgram (6-gram)
sextgram_cover <- sextgram_data %>%
    plyr::count("sextgram") %>%
    filter(freq > 10) %>%
    arrange(desc(freq))

In [None]:
# The per-occurrence n-gram tables are no longer needed once the reduced
# *_cover tables exist.
rm(bigram_data, trigram_data, quadgram_data, quintgram_data, sextgram_data)

### 3. Distributions
#### 3.1 Overview

In [None]:
# Long-format table with one row per retained n-gram: the n-gram size label
# and its frequency, stacked in the order bigram ... sextgram.
dist_df <- data.frame(
    ngram = rep(
        c("bigram", "trigram", "quadgram", "quintgram", "sextgram"),
        times = c(
            nrow(bigram_cover),
            nrow(trigram_cover),
            nrow(quadgram_cover),
            nrow(quintgram_cover),
            nrow(sextgram_cover)
        )
    ),
    freq = c(
        bigram_cover$freq,
        trigram_cover$freq,
        quadgram_cover$freq,
        quintgram_cover$freq,
        sextgram_cover$freq
    )
)
# The size label is categorical, so store it as a factor
dist_df$ngram <- as.factor(dist_df$ngram)

In [None]:
# Inspect the structure (column types and a preview of values) of dist_df.
# The commented-out calls are alternative checks; the dimensions noted next to
# dim() were recorded from an earlier run.
# dim(dist_df) # 147931 x 2
# head(dist_df, 20)
str(dist_df)

In [None]:
# Box plot of n-gram frequencies per n-gram size on a log10 y axis; the sizes
# on the x axis are ordered via reorder(ngram, -freq).
g <- ggplot(data = dist_df, aes(y = freq, x = reorder(ngram, -freq))) +
    geom_boxplot() +
    scale_y_log10() +
    labs(
        title = "Distribution of N-grams",
        x = "N-grams",
        y = "Frequencies (log10)"
    )
g
# ggsave() without a plot argument writes the most recent plot
ggsave("./data/ngrams_overview.png")

#### 3.2 Bigram

In [None]:
# Split each bigram into its two words (columns w1, w2), preview the result
# and store it on disk for the prediction step.
bi_word <- separate(bigram_cover, bigram, into = c("w1", "w2"), sep = " ")
# dim(bi_word) # 86153 x 3
head(bi_word)
saveRDS(bi_word, file = "./data/bi_words_separated.RDS")

In [None]:
# Horizontal bar chart of the 20 most frequent bigrams, bars ordered by
# frequency, then saved as a PNG (ggsave() writes the most recent plot).
bigram_cover %>%
    top_n(20, freq) %>%
    mutate(bigram = reorder(bigram, freq)) %>%
    ggplot(aes(y = bigram, x = freq)) +
    geom_col(fill = "darkgray") +
    labs(
        title = "Top 20 Frequencies of Bigram",
        x = "Frequency",
        y = "Bigram"
    )
ggsave("./data/ngrams_bigram.png")

#### 3.3 Trigram

In [None]:
# Split each trigram into its three words (columns w1..w3), preview the result
# and store it on disk for the prediction step.
tri_word <- separate(trigram_cover, trigram, into = c("w1", "w2", "w3"), sep = " ")
# dim(tri_word) # 50707 x 4
head(tri_word)
saveRDS(tri_word, file = "./data/tri_words_separated.RDS")

In [None]:
# Horizontal bar chart of the 20 most frequent trigrams, bars ordered by
# frequency, then saved as a PNG (ggsave() writes the most recent plot).
trigram_cover %>%
    top_n(20, freq) %>%
    mutate(trigram = reorder(trigram, freq)) %>%
    ggplot(aes(y = trigram, x = freq)) +
    geom_col(fill = "darkgray") +
    labs(
        title = "Top 20 Frequencies of Trigram",
        x = "Frequency",
        y = "Trigram"
    )
ggsave("./data/ngrams_trigram.png")

#### 3.4 Quadgram

In [None]:
# Split each quadgram into its four words (columns w1..w4), preview the result
# and store it on disk for the prediction step.
quad_word <- separate(
    quadgram_cover, quadgram,
    into = c("w1", "w2", "w3", "w4"), sep = " "
)
# dim(quad_word) # 9626 x 5
head(quad_word)
saveRDS(quad_word, file = "./data/quad_words_separated.RDS")

In [None]:
# Horizontal bar chart of the 20 most frequent quadgrams, bars ordered by
# frequency, then saved as a PNG (ggsave() writes the most recent plot).
quadgram_cover %>%
    top_n(20, freq) %>%
    mutate(quadgram = reorder(quadgram, freq)) %>%
    ggplot(aes(y = quadgram, x = freq)) +
    geom_col(fill = "darkgray") +
    labs(
        title = "Top 20 Frequencies of Quadgram",
        x = "Frequency",
        y = "Quadgram"
    )
ggsave("./data/ngrams_quadgram.png")

#### 3.5 Quintgram

In [None]:
# Split each quintgram into its five words (columns w1..w5), preview the
# result and store it on disk for the prediction step.
quint_word <- separate(
    quintgram_cover, quintgram,
    into = c("w1", "w2", "w3", "w4", "w5"), sep = " "
)
# dim(quint_word) # 1242 x 6
head(quint_word)
saveRDS(quint_word, file = "./data/quint_words_separated.RDS")

In [None]:
# Horizontal bar chart of the 20 most frequent quintgrams, bars ordered by
# frequency, then saved as a PNG (ggsave() writes the most recent plot).
quintgram_cover %>%
    top_n(20, freq) %>%
    mutate(quintgram = reorder(quintgram, freq)) %>%
    ggplot(aes(y = quintgram, x = freq)) +
    geom_col(fill = "darkgray") +
    labs(
        title = "Top 20 Frequencies of Quintgram",
        x = "Frequency",
        y = "Quintgram"
    )
ggsave("./data/ngrams_quintgram.png")

#### 3.6 Sextgram

In [None]:
# Split each sextgram into its six words (columns w1..w6), preview the result
# and store it on disk for the prediction step.
sext_word <- separate(
    sextgram_cover, sextgram,
    into = c("w1", "w2", "w3", "w4", "w5", "w6"), sep = " "
)
# dim(sext_word) # 203 x 7
head(sext_word)
saveRDS(sext_word, file = "./data/sext_words_separated.RDS")

In [None]:
# Horizontal bar chart of the 20 most frequent sextgrams, bars ordered by
# frequency, then saved as a PNG (ggsave() writes the most recent plot).
sextgram_cover %>%
    top_n(20, freq) %>%
    mutate(sextgram = reorder(sextgram, freq)) %>%
    ggplot(aes(y = sextgram, x = freq)) +
    geom_col(fill = "darkgray") +
    labs(
        title = "Top 20 Frequencies of Sextgram",
        x = "Frequency",
        y = "sextgram"
    )
ggsave("./data/ngrams_sextgram.png")

### 4. Time

In [None]:
# Measure the wall-clock time needed to read the five saved n-gram tables
# back from disk.
time_start <- Sys.time()

bi_word <- readRDS(file = "./data/bi_words_separated.RDS")
tri_word <- readRDS(file = "./data/tri_words_separated.RDS")
quad_word <- readRDS(file = "./data/quad_words_separated.RDS")
quint_word <- readRDS(file = "./data/quint_words_separated.RDS")
sext_word <- readRDS(file = "./data/sext_words_separated.RDS")

time_stop <- Sys.time()

# Elapsed time between the two timestamps
duration <- difftime(time_stop, time_start)
duration

## Task 5: Prediction Model


### 0. Environment Settings

In [None]:
library(data.tree)
library(DiagrammeR)
library(dplyr)
library(ggplot2)
library(igraph)
library(influenceR)
library(plyr)
library(RColorBrewer)
library(SnowballC)
library(stopwords)
library(stringi)
library(stringr)
library(tidyr)
library(tidytext)
library(tidyverse)
library(tokenizers)
library(tm)
library(wordcloud)

In [None]:
# Record the R version, platform and attached package versions for this run.
sessionInfo()

### 1. Load data

In [None]:
# Read the five word-separated n-gram tables from disk and time the load.
time_start <- Sys.time()

bi_word <- readRDS(file = "./data/bi_words_separated.RDS")
tri_word <- readRDS(file = "./data/tri_words_separated.RDS")
quad_word <- readRDS(file = "./data/quad_words_separated.RDS")
quint_word <- readRDS(file = "./data/quint_words_separated.RDS")
sext_word <- readRDS(file = "./data/sext_words_separated.RDS")

time_stop <- Sys.time()

# Elapsed time between the two timestamps
duration <- difftime(time_stop, time_start)
duration

In [None]:
# Preview the first rows of the bigram table that was just loaded.
head(bi_word)

### 2. Matching functions

In [None]:
# Shared n-gram lookup used by the five prediction functions below.
#
# Arguments:
#   ngram_table  data frame with word columns w1 .. w<order> and a `freq`
#                column
#   inputWord    character vector of context words; only the last
#                (order - 1) entries are used
#   order        size of the n-gram (2 = bigram, ..., 6 = sextgram)
#
# Returns a single string: the w<order> value of the most frequent row whose
# leading words equal the context (the first such row on ties), or "?" when
# no row matches or fewer than (order - 1) context words were supplied.
match_ngram <- function(ngram_table, inputWord, order) {
    context_size <- order - 1L
    num <- length(inputWord)
    # Not enough words to form the context: report "no prediction" instead of
    # indexing outside the vector.
    if (num < context_size) {
        return("?")
    }
    context <- inputWord[seq(num - context_size + 1L, num)]
    # A missing context word can never match an n-gram.
    if (anyNA(context)) {
        return("?")
    }
    # %in% never yields NA, so rows with missing words are simply not kept.
    keep <- rep(TRUE, nrow(ngram_table))
    for (i in seq_len(context_size)) {
        keep <- keep & ngram_table[[paste0("w", i)]] %in% context[i]
    }
    candidates <- ngram_table[keep, , drop = FALSE]
    # which.max() returns integer(0) when there is no candidate row.
    best <- which.max(candidates$freq)
    if (length(best) == 0L) {
        return("?")
    }
    as.character(candidates[[paste0("w", order)]][best])
}

# Bigram prediction function: predicts from the last word of inputWord,
# using the global table bi_word. Returns "?" when nothing matches.
bigram <- function(inputWord) {
    match_ngram(bi_word, inputWord, 2L)
}

# Trigram prediction function: predicts from the last two words of inputWord,
# using the global table tri_word. Returns "?" when nothing matches.
trigram <- function(inputWord) {
    match_ngram(tri_word, inputWord, 3L)
}

# Quadgram prediction function: predicts from the last three words of
# inputWord, using the global table quad_word. Returns "?" when nothing
# matches.
quadgram <- function(inputWord) {
    match_ngram(quad_word, inputWord, 4L)
}

# Quintgram prediction function: predicts from the last four words of
# inputWord, using the global table quint_word. Returns "?" when nothing
# matches.
quintgram <- function(inputWord) {
    match_ngram(quint_word, inputWord, 5L)
}

# Sextgram prediction function: predicts from the last five words of
# inputWord, using the global table sext_word. Returns "?" when nothing
# matches.
sextgram <- function(inputWord) {
    match_ngram(sext_word, inputWord, 6L)
}

In [None]:
# Predict the next word for a piece of text.
#
# Arguments:
#   input  character string (or vector of strings, which are joined with
#          spaces) holding the words typed so far
#
# Returns a single string: the predicted next word, or "?" when the input
# contains no words or no n-gram table has a match.
#
# The text is stripped of everything that is not a letter or whitespace,
# split into words and lower-cased. At most the last five words are used:
# the longest applicable n-gram is tried first (sextgram for five or more
# words) and, whenever a lookup answers "?", the next shorter n-gram is tried,
# down to the bigram.
predictNgrams <- function(input) {
    # clean the input text
    replace_reg <- "[^[:alpha:][:space:]]*"
    input <- as.character(input)
    input <- paste(input[!is.na(input)], collapse = " ")
    input <- str_replace_all(input, replace_reg, "")

    # separate words, lower case, drop empty pieces
    input_words <- tolower(unlist(str_split(input, boundary("word"))))
    input_words <- input_words[!is.na(input_words) & nzchar(input_words)]
    input_count <- length(input_words)
    if (input_count == 0L) {
        return("?")
    }

    # Matching functions indexed by the number of context words they use;
    # each one reads only the trailing words of the vector it is given.
    matchers <- list(bigram, trigram, quadgram, quintgram, sextgram)
    for (context_size in seq(min(input_count, length(matchers)), 1L)) {
        out <- matchers[[context_size]](input_words)
        if (isTRUE(out != "?")) {
            return(out)
        }
    }
    "?"
}

In [None]:
# Example: ask the model for the word that follows "case of".
predictNgrams("case of")