## Task 5: Creative Exploration
So far you have used basic models to understand and predict words. In this next task, your goal is to use all the resources you have available to you (from the Data Science Specialization, resources on the web, or your own creativity) to improve the predictive accuracy while reducing computational runtime and model complexity (if you can). Be sure to hold out a test set to evaluate the new, more creative models you are building.

**Tasks to accomplish**
- Explore new models and data to improve your predictive model.
- Evaluate your new predictions on both accuracy and efficiency. 

**Questions to consider**
- What are some alternative data sets you could consider using?
- What are ways in which the n-gram model may be inefficient?
- What are the most commonly missed n-grams? Can you think of a reason why they would be missed and fix that?
- What are some other things that other people have tried to improve their model?
- Can you estimate how uncertain you are about the words you are predicting? 

### 0. Environment Settings

In [None]:
library(data.tree)
library(DiagrammeR)
library(dplyr)
library(ggplot2)
library(igraph)
library(influenceR)
library(plyr)
library(RColorBrewer)
library(SnowballC)
library(stopwords)
library(stringi)
library(stringr)
library(tidyr)
library(tidytext)
library(tidyverse)
library(tokenizers)
library(tm)
library(wordcloud)

In [162]:
# Print the R version, platform, locale, and attached packages used for
# this run, so the results below can be reproduced.
sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.1 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=zh_CN.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=zh_CN.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=zh_CN.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=zh_CN.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.5.0      purrr_0.3.4        readr_1.3.1        tibble_3.0.3      
 [5] tidyverse_1.3.0    wordcloud_2.6      tm_0.7-7           NLP_0.2-0         
 [9] tokenizers_0.2.1   tidytext_0.2.5     tidyr_1.1.2        stringr_1.4.0     
[13] stringi_1.4.6      s

### 1. Load data

In [79]:
# Read the five n-gram tables consulted by the matching functions below
# (bi_word ... sext_word) from RDS files, and time how long the reads take.
time_start <- Sys.time()
bi_word <- readRDS("./data/bi_words_separated.RDS")
tri_word <- readRDS("./data/tri_words_separated.RDS")
quad_word <- readRDS("./data/quad_words_separated.RDS")
quint_word <- readRDS("./data/quint_words_separated.RDS")
sext_word <- readRDS("./data/sext_words_separated.RDS")
time_stop <- Sys.time()
# Difference between the two timestamps; auto-printed as the cell output.
duration <- time_stop - time_start
duration

Time difference of 0.08728886 secs

In [80]:
# Preview the first rows of the bigram table (columns w1, w2, freq).
head(bi_word)

Unnamed: 0_level_0,w1,w2,freq
Unnamed: 0_level_1,<chr>,<chr>,<int>
1,of,the,44040
2,in,the,41732
3,to,the,25093
4,for,the,20872
5,on,the,20043
6,to,be,16518


### 2. Matching functions

In [148]:
# Bigram prediction function
#
# Predicts the next word from the last word of `inputWord`, using the
# `bi_word` frequency table (columns w1, w2, freq).
#
# inputWord: character vector of words; only the last element is used.
# Returns:   a length-one character vector with the most frequent
#            continuation, or "?" when `inputWord` is empty or its last word
#            has no row in `bi_word`. Ties on `freq` go to the earliest row.
bigram <- function(inputWord) {
    num <- length(inputWord)
    # Nothing to condition on: report "no prediction" instead of failing.
    if (num < 1L) {
        return("?")
    }
    # which() drops NA comparisons, so an NA word simply finds no rows.
    hits <- which(bi_word$w1 == inputWord[num])
    if (length(hits) == 0L) {
        return("?")
    }
    # which.max() returns the first maximum, i.e. the earliest row on ties.
    best <- hits[which.max(bi_word$freq[hits])]
    # as.character() on the column (not on a data frame) also yields the
    # label rather than the integer code if the column is a factor.
    as.character(bi_word$w2[best])
}

# Trigram prediction function
#
# Predicts the next word from the last two words of `inputWord`, using the
# `tri_word` frequency table (columns w1, w2, w3, freq).
#
# inputWord: character vector of words; only the last two elements are used.
# Returns:   a length-one character vector with the most frequent
#            continuation, or "?" when fewer than two words are supplied or
#            the word pair has no row in `tri_word`. Ties on `freq` go to the
#            earliest row.
trigram <- function(inputWord) {
    num <- length(inputWord)
    # Too little context would make the index below zero-length.
    if (num < 2L) {
        return("?")
    }
    # which() drops NA comparisons, so NA words simply find no rows.
    hits <- which(
        tri_word$w1 == inputWord[num - 1L] &
        tri_word$w2 == inputWord[num]
    )
    if (length(hits) == 0L) {
        return("?")
    }
    # which.max() returns the first maximum, i.e. the earliest row on ties.
    best <- hits[which.max(tri_word$freq[hits])]
    as.character(tri_word$w3[best])
}

# Quadgram prediction function
#
# Predicts the next word from the last three words of `inputWord`, using the
# `quad_word` frequency table (columns w1..w4, freq).
#
# inputWord: character vector of words; only the last three elements are
#            used.
# Returns:   a length-one character vector with the most frequent
#            continuation, or "?" when fewer than three words are supplied
#            or the word triple has no row in `quad_word`. Ties on `freq` go
#            to the earliest row.
quadgram <- function(inputWord) {
    num <- length(inputWord)
    # With too few words the offsets below would become zero or negative
    # indices, which select the wrong elements rather than failing loudly.
    if (num < 3L) {
        return("?")
    }
    # which() drops NA comparisons, so NA words simply find no rows.
    hits <- which(
        quad_word$w1 == inputWord[num - 2L] &
        quad_word$w2 == inputWord[num - 1L] &
        quad_word$w3 == inputWord[num]
    )
    if (length(hits) == 0L) {
        return("?")
    }
    # which.max() returns the first maximum, i.e. the earliest row on ties.
    best <- hits[which.max(quad_word$freq[hits])]
    as.character(quad_word$w4[best])
}

# Quintgram prediction function
#
# Predicts the next word from the last four words of `inputWord`, using the
# `quint_word` frequency table (columns w1..w5, freq).
#
# inputWord: character vector of words; only the last four elements are
#            used.
# Returns:   a length-one character vector with the most frequent
#            continuation, or "?" when fewer than four words are supplied or
#            the word sequence has no row in `quint_word`. Ties on `freq` go
#            to the earliest row.
quintgram <- function(inputWord) {
    num <- length(inputWord)
    # With too few words the offsets below would become zero or negative
    # indices, which select the wrong elements rather than failing loudly.
    if (num < 4L) {
        return("?")
    }
    # which() drops NA comparisons, so NA words simply find no rows.
    hits <- which(
        quint_word$w1 == inputWord[num - 3L] &
        quint_word$w2 == inputWord[num - 2L] &
        quint_word$w3 == inputWord[num - 1L] &
        quint_word$w4 == inputWord[num]
    )
    if (length(hits) == 0L) {
        return("?")
    }
    # which.max() returns the first maximum, i.e. the earliest row on ties.
    best <- hits[which.max(quint_word$freq[hits])]
    as.character(quint_word$w5[best])
}

# Sextgram prediction function
#
# Predicts the next word from the last five words of `inputWord`, using the
# `sext_word` frequency table (columns w1..w6, freq).
#
# inputWord: character vector of words; only the last five elements are
#            used.
# Returns:   a length-one character vector with the most frequent
#            continuation, or "?" when fewer than five words are supplied or
#            the word sequence has no row in `sext_word`. Ties on `freq` go
#            to the earliest row.
sextgram <- function(inputWord) {
    num <- length(inputWord)
    # With too few words the offsets below would become zero or negative
    # indices, which select the wrong elements rather than failing loudly.
    if (num < 5L) {
        return("?")
    }
    # which() drops NA comparisons, so NA words simply find no rows.
    hits <- which(
        sext_word$w1 == inputWord[num - 4L] &
        sext_word$w2 == inputWord[num - 3L] &
        sext_word$w3 == inputWord[num - 2L] &
        sext_word$w4 == inputWord[num - 1L] &
        sext_word$w5 == inputWord[num]
    )
    if (length(hits) == 0L) {
        return("?")
    }
    # which.max() returns the first maximum, i.e. the earliest row on ties.
    best <- hits[which.max(sext_word$freq[hits])]
    as.character(sext_word$w6[best])
}

In [149]:
# Predict the next word for a piece of free text.
#
# The text is stripped of everything except letters and whitespace, split
# into lower-case words, and handed to the n-gram matching function with the
# longest context the input can fill (bigram for one word up to sextgram for
# five or more words).
#
# input:   character vector holding the text typed so far. Several elements
#          are joined with single spaces; NA elements are ignored.
# backoff: if TRUE, retry with successively shorter n-gram functions while
#          the result is "?". Defaults to FALSE, which consults only the
#          longest applicable table.
# Returns: the value of the selected matching function (a predicted word or
#          "?"), or "?" when the cleaned input contains no words.
predictNgrams <- function(input, backoff = FALSE) {
    # Work on one plain string instead of a one-column data frame, so the
    # stringr calls below do not depend on implicit list coercion.
    text <- paste(input[!is.na(input)], collapse = " ")
    # clean the input text
    replace_reg <- "[^[:alpha:][:space:]]*"
    text <- str_replace_all(text, replace_reg, "")
    # separate words, lower case, and count what is actually left
    input_words <- tolower(unlist(str_split(text, boundary("word"))))
    input_words <- input_words[!is.na(input_words) & nzchar(input_words)]
    input_count <- length(input_words)
    if (input_count == 0L) {
        return("?")
    }
    # call the matching functions; position k holds the function that needs
    # k words of context. Each one reads from the end of the vector, so
    # inputs longer than five words use their last five.
    matchers <- list(bigram, trigram, quadgram, quintgram, sextgram)
    level <- min(input_count, length(matchers))
    out <- matchers[[level]](input_words)
    # Optional back-off: "?" is the matching functions' no-match marker.
    while (isTRUE(backoff) && level > 1L && isTRUE(out == "?")) {
        level <- level - 1L
        out <- matchers[[level]](input_words)
    }
    out
}

In [165]:
# Example: a two-word input, which predictNgrams routes to trigram().
predictNgrams("case of")