/
predict_polarity_keras.R
156 lines (128 loc) · 6.26 KB
/
predict_polarity_keras.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#' Predict polarity for the given dataset.
#'
#' @name predict_polarity_keras
#'
#' @description Predict polarity for the given dataset and filters using the pre-trained Keras LSTM model.
#'
#' @param data the sentiment140 test \code{text} for text of the tweet.
#' @param user_list a vector of users for which to filter the dataset
#' @param start_date_time input start_date_time in POSIXct format on which to filter the dataset
#' @param end_date_time input end_date_time in POSIXct format on which to filter the dataset
#' @param keyword_list a list of string keywords on which to filter the dataset
#' @param maxlen Maximum length of a sequence.
#' @param max_words Maximum number of words to consider using word frequency measure.
#' @param trained_data Training dataset.
#' @param model_load_path File path location of the trained model file.
#' @return a list object with \code{raw} filtered dataframe, \code{predictions} dataframe that holds the predicted polarity using the model, \code{confusion_matrix}, multiple model performance statistics and a \code{plot} comparing the actual and predicted polarity.
#'
#' @export
#'
#' @importFrom ggplot2 ggplot geom_tile geom_text aes scale_fill_gradient theme_bw theme labs
#' @importFrom magrittr %>%
#' @importFrom dplyr filter count mutate pull
#' @importFrom rlang .data
#'
#' @keywords modelling visualization keras
#'
#' @examples
#' library(deepSentimentR)
#' predict_polarity_keras()
#'
utils::globalVariables(c("sentiment140_test"))
predict_polarity_keras <- function(data = sentiment140_test,
user_list,
start_date_time,
end_date_time,
keyword_list,
maxlen = keras_config_params$maxlen,
max_words = keras_config_params$max_words,
trained_data = sentiment140_train,
model_load_path = system.file("extdata",
"train_no_glove_lstm.rds",
package = "deepSentimentR",
mustWork = TRUE)) {
validate_sentiment_data_frame(data = data)
validate_list(input_list = user_list, message="Aborting, invalid user list")
validate_time_range(start_date_time = start_date_time,
end_date_time = end_date_time)
validate_list(input_list = keyword_list, message="Aborting, invalid keyword list")
clause <- make_clause(user_list, start_date_time, end_date_time, keyword_list)
exp <- eval(parse(text = clause), data, parent.frame())
raw <- data %>% dplyr::filter(exp)
raw <- subsample_input_data(raw)
result <- list()
result$raw <- raw
predictions <- get_predictions(data = raw,
maxlen = maxlen,
model_load_path = model_load_path,
trained_data = trained_data,
max_words = max_words)
result$predictions <- predictions
predictions %>%
dplyr::count(.data$polarity, .data$pred_polarity, name = "count") %>%
dplyr::mutate(polarity = as.factor(.data$polarity),
pred_polarity = as.factor(.data$pred_polarity)) -> confusion
result$confusion_matrix <- confusion
result$true_negative <- confusion %>%
filter(.data$polarity == 0, .data$pred_polarity == 0) %>%
pull(.data$count)
result$true_positive <- confusion %>%
filter(.data$polarity == 1, .data$pred_polarity == 1) %>%
pull(.data$count)
result$false_positive <- confusion %>%
filter(.data$polarity==0, .data$pred_polarity == 1) %>%
pull(.data$count)
result$false_negative <- confusion %>%
filter(.data$polarity==1, .data$pred_polarity==0) %>%
pull(.data$count)
result$precision <- result$true_positive / (result$true_positive + result$false_positive)
result$recall <- result$true_positive / (result$true_positive + result$false_negative)
result$f1 <- 2 * result$precision * result$recall / (result$precision + result$recall)
result$accuracy <- (result$true_positive + result$true_negative) /
(result$true_positive + result$true_negative + result$false_positive + result$false_negative)
p <- confusion %>%
ggplot2::ggplot(mapping = ggplot2::aes(x = .data$polarity, y = .data$pred_polarity)) +
ggplot2::geom_tile(ggplot2::aes(fill = .data$count), colour = "white") +
ggplot2::geom_text(ggplot2::aes(label = sprintf("%1.0f", .data$count)), vjust = 1) +
ggplot2::scale_fill_gradient(low = "light blue", high = "royal blue") +
ggplot2::theme_bw() +
ggplot2::theme(legend.position = "none") +
ggplot2::labs(x = "Polarity",
y = "Predicted Polarity",
title = "Confusion Matrix for Predicted Polarity")
result$plot <- p
return(result)
}
#' Get predictions for the given subset of data
#'
#' @name get_predictions
#'
#' @description Run given keras model to make predictions on the given subset of data
#'
#' @param data Given subset of data to make predictions on.
#' @param maxlen Maximum length of a sequence.
#' @param model_load_path File path location of the trained model file.
#' @param trained_data Training dataset.
#' @param max_words Maximum number of words to consider using word frequency measure.
#' @return Given subset of data appended with pred_polarity column for predicted values.
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr mutate
#' @importFrom rlang .data
#' @importFrom keras texts_to_sequences pad_sequences unserialize_model predict_classes
#'
get_predictions <- function(data, maxlen, model_load_path, trained_data, max_words) {
#Make sure polarity is 0/1
data %>%
dplyr::mutate(polarity = ifelse(.data$polarity=="Positive", 1, 0)) -> data
tokenizer <- get_tokenizer(data = trained_data,
max_words = max_words)
sequences <- keras::texts_to_sequences(tokenizer, data$text)
x_test <- keras::pad_sequences(sequences, maxlen = maxlen)
y_test <- as.array(data$polarity)
model <- keras::unserialize_model(readRDS(model_load_path))
model %>%
keras::predict_classes(x_test) -> preds
data %>%
mutate(pred_polarity = preds)
}