# input_validation.R
#' Package-wide configuration constants
#'
#' @field max_input_data_size maximum number of rows kept from an input data
#'   frame, to control the response time
#' @field default_seed seed handed to `set.seed()` before sub-sampling
#' @field col_names column names a sentiment data frame must contain
#'
config_params <- list(
  max_input_data_size = 50000,
  default_seed = 314159,
  col_names = c("id", "polarity", "user", "text", "date")
)
#' Check whether an object is a usable sentiment data set
#'
#' @description An object qualifies when it is a data frame with at least one
#'   row that contains every column named in `config_params$col_names`.
#'   Extra columns are allowed.
#'
#' @param data object to check
#' @return `TRUE` when `data` qualifies, `FALSE` otherwise
#'
is_valid_sentiment_data_frame <- function(data) {
  # Anything that is not a non-empty data frame is rejected up front, before
  # the column names are looked at.
  has_rows <- is.data.frame(data) && nrow(data) > 0
  if (!has_rows) {
    return(FALSE)
  }

  required_columns <- config_params$col_names
  all(required_columns %in% colnames(data))
}
#' Assert that an object is a valid sentiment data frame
#'
#' @description Delegates the check to `is_valid_sentiment_data_frame()` and
#'   raises an error when it fails.
#'
#' @param data input data
#' @return `NULL`, invisibly, when `data` is valid. Execution halts with an
#'   error otherwise.
#'
validate_sentiment_data_frame <- function(data) {
  if (is_valid_sentiment_data_frame(data)) {
    return(invisible(NULL))
  }
  stop('validate_sentiment_data_frame: invalid data frame')
}
#' Assert that an optional argument is a vector
#'
#' @description Passes silently when `input_list` is not supplied or when
#'   `is.vector(input_list)` is `TRUE`. Note that `is.vector()` is `FALSE` for
#'   objects carrying attributes other than names (for example factors), so
#'   those are rejected too.
#'
#' @param input_list input list
#' @param message message text to report in case of failure (optional)
#' @return `NULL`, invisibly, when the check passes. Execution halts with an
#'   error otherwise; the error text includes the offending values.
#'
validate_list <- function(input_list, message) {
  if (missing(input_list) || is.vector(input_list)) {
    return(invisible(NULL))
  }

  # Fall back to the stock wording when the caller gave no message.
  reason <- if (missing(message)) "Aborting, invalid given list " else message
  rendered_items <- paste(input_list, collapse = ",")
  stop(paste("validate_list:", reason, "c(", rendered_items, " )."))
}
#' Assert that an optional time range is well formed
#'
#' @description The range is optional as a whole: either both bounds are
#'   omitted, or both are supplied and both are POSIXct. Supplying only one
#'   bound, or a bound of another type, is an error.
#'
#' @param start_date_time input start_date_time in POSIXct format
#' @param end_date_time input end_date_time in POSIXct format
#' @return `NULL`, invisibly, when the range is acceptable. Execution halts
#'   with an error otherwise.
#'
validate_time_range <- function(start_date_time, end_date_time) {
  start_absent <- missing(start_date_time)
  end_absent <- missing(end_date_time)

  # Exactly one bound given: the range is incomplete.
  if (xor(start_absent, end_absent)) {
    stop("validate_time_range: Aborting because both start_date_time and end_date_time are required.")
  }

  # No range requested at all, so there is nothing to type-check.
  if (start_absent) {
    return(invisible(NULL))
  }

  bounds_are_posixct <- lubridate::is.POSIXct(start_date_time) &&
    lubridate::is.POSIXct(end_date_time)
  if (!bounds_are_posixct) {
    stop("validate_time_range: Aborting because start/end date time is NOT of POSIXct type")
  }
}
#' Build the complete filter clause
#'
#' @description Joins the user, time-range and keyword sub-clauses with
#'   `" & "`, in that order. Each argument is handed straight to its
#'   sub-clause builder, so an argument omitted here is also seen as missing
#'   there and contributes the neutral sub-clause `"TRUE"`.
#'
#' @param user_list a vector of users for which to filter the dataset.
#' @param start_date_time input start_date_time in POSIXct format on which to filter the dataset
#' @param end_date_time input end_date_time in POSIXct format on which to filter the dataset
#' @param keyword_list a list of string keywords on which to filter the dataset
#' @return filter clause string
#'
#' @importFrom stringr str_c
#'
#' @keywords validation
#'
make_clause <- function(user_list, start_date_time, end_date_time, keyword_list) {
  conjunction <- " & "
  stringr::str_c(
    get_user_list_condition(user_list = user_list),
    conjunction,
    get_time_condition(
      start_date_time = start_date_time,
      end_date_time = end_date_time
    ),
    conjunction,
    get_keyword_condition(keyword_list = keyword_list)
  )
}
#' Make sub-clause for given keywords
#'
#' @description Builds the text of a `stringr::str_detect()` call on the
#'   `text` column whose pattern is the keywords joined with `|`, so a row
#'   matches when any keyword is found. Returns the neutral sub-clause
#'   `"TRUE"` when `keyword_list` is not supplied, is empty, or holds only
#'   `NA`.
#'
#'   `NA` entries are dropped before the pattern is built. Otherwise they
#'   would be pasted in as the literal text "NA" and match texts containing
#'   it.
#'
#' @param keyword_list a list of string keywords on which to filter the dataset
#' @return filter sub-clause string
#'
#' @keywords query
#'
get_keyword_condition <- function(keyword_list) {
  if (missing(keyword_list) || all(is.na(keyword_list))) {
    return("TRUE")
  }

  # The guard above guarantees at least one non-NA keyword remains.
  keywords <- keyword_list[!is.na(keyword_list)]
  pattern <- paste(keywords, collapse = "|")

  # NOTE(review): keywords are interpolated verbatim into the generated
  # expression; a keyword containing a double quote would break it.
  paste0("( stringr::str_detect(text, pattern = \"", pattern, "\" ))")
}
#' Make sub-clause for given time range
#'
#' @description Builds the text of an inclusive range test on the `date`
#'   column, with both bounds rendered as quoted strings. Returns the neutral
#'   sub-clause `"TRUE"` unless both bounds are supplied.
#'
#' @param start_date_time input start_date_time in POSIXct format on which to filter the dataset
#' @param end_date_time input end_date_time in POSIXct format on which to filter the dataset
#' @return filter sub-clause string
#'
#' @importFrom stringr str_c
#'
#' @keywords query
#'
get_time_condition <- function(start_date_time, end_date_time) {
  # A half-specified range is treated the same as no range at all.
  if (missing(start_date_time) || missing(end_date_time)) {
    return("TRUE")
  }

  stringr::str_c(
    "( date >= \"", start_date_time,
    "\" & date <= \"", end_date_time,
    "\" )"
  )
}
#' Make sub-clause for given users
#'
#' @description Builds the text of a `user %in% c(...)` membership test with
#'   every user rendered as a double-quoted string. Returns the neutral
#'   sub-clause `"TRUE"` when `user_list` is not supplied, is empty, or holds
#'   only `NA`.
#'
#'   `NA` entries are dropped before the clause is built, so a partially
#'   missing list still yields a usable clause instead of a missing one.
#'
#' @param user_list a vector of users for which to filter the dataset.
#' @return filter sub-clause string
#'
#' @keywords query
#'
get_user_list_condition <- function(user_list) {
  if (missing(user_list) || all(is.na(user_list))) {
    return("TRUE")
  }

  # The guard above guarantees at least one non-NA user remains.
  users <- user_list[!is.na(user_list)]

  # NOTE(review): user names are interpolated verbatim into the generated
  # expression; a name containing a double quote would break it.
  quoted_users <- paste0("\"", users, "\"", collapse = ",")
  paste0("( user %in% c( ", quoted_users, " ))")
}
#' Cap input data at a maximum number of rows
#'
#' @description Validates `data` as a sentiment data frame, then returns it
#'   unchanged when it has at most `data_size` rows, or a random sample of
#'   `data_size` rows otherwise. The global random seed is reset to
#'   `config_params$default_seed` on every call, including calls where no
#'   sampling takes place.
#'
#' @param data input data
#' @param data_size defaults to the max input data size
#' @return sub sampled data if needed.
#'
subsample_input_data <- function(data, data_size = config_params$max_input_data_size) {
  validate_sentiment_data_frame(data)
  set.seed(config_params$default_seed)

  if (nrow(data) <= data_size) {
    return(data)
  }
  dplyr::sample_n(data, data_size)
}