-
Notifications
You must be signed in to change notification settings - Fork 0
/
redcap_data.R
313 lines (220 loc) · 12.4 KB
/
redcap_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#' Read REDCap data
#'
#' @description
#' This function allows users to read datasets from a REDCap project into R for analysis, either via export of the data or via an API connection.
#'
#' The REDCap API is an interface that allows communication with the REDCap server without going through the interactive REDCap interface.
#'
#' @details
#' Exactly one of the two input modes has to be fully specified: `data_path` together with `dic_path` (exported data), or `uri` together with `token` (API connection). Mixing arguments of both modes, or not completing either of them, is an error.
#'
#' When reading exported data, the working directory is temporarily changed to the folder that contains `data_path` while the exported R script is sourced; it is restored before the function returns, also when an error occurs.
#'
#' In both modes the first column of the data is named `record_id`, fields of type `descriptive` are removed from the dictionary, and every character column of the data is converted with `stringr::str_conv(x, "UTF-8")`.
#'
#' @note If you will give further use to the package, we advise you to use the argument 'dic_path' to read your dictionary, as all other functions need it in order to run properly.
#'
#' @param data_path Character string with the pathname of the R file to read the dataset from.
#' @param dic_path Character string with the pathname of the dictionary (XLSX or CSV).
#' @param event_path Character string with the pathname of the file (XLSX or CSV) containing the correspondence between each event and each form (it can be downloaded through the `Designate Instruments for My Events` tab inside the `Project Setup` section of REDCap).
#' @param uri The URI (Uniform Resource Identifier) of the REDCap project.
#' @param token Character vector with the generated token.
#' @param filter_field Character vector with the fields of the REDCap project desired to import into R (API connection only).
#' @return List containing the dataset (`data`) and the dictionary (`dictionary`) of the REDCap project. If the event_path is specified, it will also contain a third element (`event_form`) with the correspondence of the events & forms of the project. With an API connection to a longitudinal project, `event_form` is read through the API when event_path is not specified.
#'
#' @note To read exported data, you must first use REDCap's 'Export Data' function and select the 'R Statistical Software' format. It will then generate a CSV file with all the observations and an R file with the necessary code to complete each variable's information.
#'
#' @export
redcap_data <- function(data_path = NA, dic_path = NA, event_path = NA, uri = NA, token = NA, filter_field = NA) {
  # The exported script is sourced from its own folder, so remember the current
  # working directory and restore it however the function exits.
  oldwd <- getwd()
  on.exit(setwd(oldwd), add = TRUE)

  # Local helper: read an XLSX or CSV file into a data frame. `header` says
  # whether the first row of the file holds the column names.
  read_table_file <- function(path, header = TRUE) {
    extension <- tolower(tools::file_ext(path))
    if (extension == "xlsx") {
      openxlsx::read.xlsx(path, colNames = header, detectDates = TRUE)
    } else if (extension == "csv") {
      utils::read.csv(path, encoding = "UTF-8", header = header)
    } else {
      stop("Unsupported file format. Only XLSX and CSV are supported.", call. = FALSE)
    }
  }

  # Local helper: split a dictionary choices string ("1, Yes | 2, No") into its
  # codes and labels. Each "|"-separated piece is cut at its FIRST comma, so
  # commas, digits and quotes inside a label are harmless.
  parse_choices <- function(choices) {
    pieces <- strsplit(as.character(choices), "|", fixed = TRUE)[[1]]
    pieces <- pieces[!is.na(pieces) & nzchar(trimws(pieces))]
    comma <- regexpr(",", pieces, fixed = TRUE)
    has_comma <- comma > 0
    codes <- trimws(ifelse(has_comma, substr(pieces, 1L, comma - 1L), pieces))
    choice_labels <- ifelse(has_comma, substr(pieces, comma + 1L, nchar(pieces)), pieces)
    # Only the single space that follows the comma is removed.
    # NOTE(review): whitespace in front of the next "|" stays part of the label
    # ("Yes " for "1, Yes | 2, No"), exactly as the previous implementation
    # produced it. Switch to trimws() here if that is not wanted.
    choice_labels <- sub("^ ", "", choice_labels)
    # Purely numeric codes are used as numeric levels, as before; any other
    # code (negative, alphanumeric) is matched as text.
    if (length(codes) > 0 && all(grepl("^[0-9]+$", codes))) {
      codes <- as.numeric(codes)
    }
    list(codes = codes, labels = choice_labels)
  }

  # ---- Argument validation ---------------------------------------------------
  export_given <- !(c(data_path, dic_path) %in% NA)
  api_given <- !(c(token, uri) %in% NA)

  # data_path, dic_path and another argument are specified.
  if (all(export_given) && any(api_given)) {
    stop("Too many arguments, if you want to read exported data from REDCap use only the arguments data_path and dic_path", call. = FALSE)
  }
  # token, uri and another argument are specified.
  if (all(api_given) && any(export_given)) {
    stop("Too many arguments, if you want to read data from REDCap through an API connection use only the arguments uri and token.", call. = FALSE)
  }

  use_export <- all(export_given) && !any(api_given)
  use_api <- all(api_given) && !any(export_given)

  if (use_export) {
    # ---- Exported data -------------------------------------------------------
    # The first two lines of the exported script are skipped and the remainder
    # is evaluated in a private environment.
    tmp_env <- new.env()
    file.lines <- scan(data_path, what = character(), skip = 2, sep = '\n', quiet = TRUE)
    script_con <- textConnection(paste(file.lines, collapse = '\n'))
    # source() does not close a connection it is handed, so close it here.
    on.exit(close(script_con), add = TRUE)

    # Run the script from its own folder so the relative paths it uses resolve,
    # then go straight back so dic_path/event_path keep their original meaning.
    setwd(dirname(data_path))
    source(script_con, local = tmp_env, encoding = "UTF-8")
    setwd(oldwd)

    exported <- get("data", envir = tmp_env, inherits = FALSE)
    names(exported)[1] <- "record_id"

    # Read the dictionary without header: its first row becomes the (cleaned)
    # column names and is then dropped.
    dic <- read_table_file(dic_path, header = FALSE)
    names(dic) <- dic[1, ]
    dic <- dic[-1, ]
    names(dic) <- janitor::make_clean_names(names(dic))
    names(dic)[1] <- "field_name"
    dic[1, 1] <- "record_id"

    # Remove descriptive variables from dictionary
    if ("descriptive" %in% dic$field_type) {
      dic <- dic %>% dplyr::filter(!.data$field_type %in% "descriptive")
    }

    # Indicator of longitudinal projects
    longitudinal <- "redcap_event_name" %in% names(exported)

    if (!is.na(event_path)) {
      event_form <- read_table_file(event_path)
      data_def <- list(data = exported, dictionary = dic, event_form = event_form)
    } else {
      # No event file although the project is longitudinal
      if (longitudinal) {
        warning("The project contains more than one event. You might want to load the event-form correspondence using the argument event_path.")
      }
      data_def <- list(data = exported, dictionary = dic)
    }
  } else if (use_api) {
    # ---- API connection ------------------------------------------------------
    message("Importing in progress...")

    # Arguments shared by both record requests; `fields` is only sent when the
    # caller asked for a subset.
    read_args <- list(redcap_uri = uri, token = token, verbose = FALSE, export_data_access_groups = TRUE)
    if (!all(filter_field %in% NA)) {
      read_args$fields <- filter_field
    }

    # First request: labelled values with labelled headers.
    labelled_data <- suppressMessages(
      do.call(REDCapR::redcap_read, c(read_args, list(raw_or_label = "label", raw_or_label_headers = "label")))$data
    )

    # Labelled REDCap default columns and the name each gets in the output.
    factor_names <- c(
      "Event Name" = "redcap_event_name.factor",
      "Repeat Instrument" = "redcap_repeat_instrument.factor",
      "Data Access Group" = "redcap_data_access_group.factor"
    )
    redcap_names <- intersect(names(factor_names), names(labelled_data))
    rename_redcap <- redcap_names
    names(rename_redcap) <- unname(factor_names[redcap_names])
    inorder_cols <- redcap_names[!redcap_names %in% "Repeat Instrument"]

    # NOTE(review): the record identifier is looked up by the header
    # "Record ID"; a project whose first field has another label fails here.
    main_vars <- labelled_data %>%
      dplyr::mutate(dplyr::across(dplyr::all_of(inorder_cols), forcats::fct_inorder)) %>%
      dplyr::rename("record_id" = "Record ID", dplyr::all_of(rename_redcap)) %>%
      dplyr::select("record_id", dplyr::all_of(names(rename_redcap)))

    # Variable labels = labelled headers without a trailing "...number" suffix
    var_labels <- gsub("\\.{3}\\d+$", "", names(labelled_data))

    message("Almost done...")

    # Second request: raw values.
    data_api <- do.call(REDCapR::redcap_read_oneshot, c(read_args, list(raw_or_label = "raw")))$data
    if (nrow(data_api) == 0) {
      stop("No observational data is available for reading. Please ensure that you add records to your REDCap project.", call. = FALSE)
    }
    names(data_api)[1] <- "record_id"

    # Read dictionary using the API connection
    dic_api <- REDCapR::redcap_metadata_read(redcap_uri = uri, token = token, verbose = FALSE)$data

    # Give the API dictionary the column names used for an exported dictionary.
    # The mapping is applied by name, so it does not depend on column order.
    dic_renames <- c(
      select_choices_or_calculations = "choices_calculations_or_slider_labels",
      branching_logic = "branching_logic_show_field_only_if",
      question_number = "question_number_surveys_only"
    )
    to_rename <- names(dic_api) %in% names(dic_renames)
    names(dic_api)[to_rename] <- unname(dic_renames[names(dic_api)[to_rename]])

    # Attach the labels, column by column (both requests are expected to
    # return the same columns in the same order).
    data_api <- as.data.frame(purrr::map2(data_api, var_labels, ~labelled::set_variable_labels(.x, .y, .strict = FALSE)))

    # Remove descriptive variables from dictionary
    if ("descriptive" %in% dic_api$field_type) {
      dic_api <- dic_api %>% dplyr::filter(!.data$field_type %in% "descriptive")
    }

    # If filter_field is specified, keep only those fields in the dictionary
    if (!all(filter_field %in% NA)) {
      dic_api <- dic_api %>% dplyr::filter(.data$field_name %in% filter_field)
    }

    # Checkbox columns (names containing "___") get a "<name>.factor" companion
    if (any(dic_api$field_type %in% "checkbox")) {
      var_check <- grep("___", names(data_api), value = TRUE)
      data_api <- data_api %>%
        dplyr::mutate(dplyr::across(dplyr::all_of(var_check), ~ factor(., levels = c("0", "1"), labels = c("Unchecked", "Checked")), .names = "{col}.factor"))
    }

    # Radio button and dropdown fields get a "<name>.factor" companion whose
    # levels and labels come from the dictionary.
    is_choice_field <- dic_api$field_type %in% c("radio", "dropdown")
    for (k in which(is_choice_field)) {
      field <- dic_api$field_name[[k]]
      if (!field %in% names(data_api)) {
        stop("Field '", field, "' is defined in the dictionary but is missing from the data.", call. = FALSE)
      }
      choices <- parse_choices(dic_api$choices_calculations_or_slider_labels[[k]])
      data_api[[paste0(field, ".factor")]] <- factor(data_api[[field]], levels = choices$codes, labels = choices$labels)
    }

    # Join the main_vars to the imported data (row order of both requests is
    # assumed to match)
    data_api <- data_api %>%
      dplyr::bind_cols(main_vars %>% dplyr::select(-"record_id"))

    # Indicator of longitudinal projects
    longitudinal <- "redcap_event_name" %in% names(data_api)

    if (!is.na(event_path)) {
      # event_path is honoured, but it is not needed with an API connection
      warning("The event_path argument is not necessary as the event-form correspondence can be automatically read with the API connection")
      event_form <- read_table_file(event_path)
      data_def <- list(data = data_api, dictionary = dic_api, event_form = event_form)
    } else if (longitudinal) {
      # Longitudinal project without event file: read the mapping through the
      # API and drop every column whose name contains "_complete".
      event_form <- as.data.frame(REDCapR::redcap_event_instruments(redcap_uri = uri, token = token, verbose = FALSE)$data)
      keep_cols <- !grepl("_complete", names(data_api))
      data_def <- list(data = data_api[, keep_cols, drop = FALSE], dictionary = dic_api, event_form = event_form)
    } else {
      data_def <- list(data = data_api, dictionary = dic_api)
    }

    message("Done!")
  } else {
    # Neither mode is complete (e.g. only data_path, or no argument at all).
    stop("Insufficient arguments: specify both data_path and dic_path to read exported data, or both uri and token to read data through an API connection.", call. = FALSE)
  }

  # Specifying the "UTF-8" encoding to each character column of the data
  for (i in seq_along(data_def$data)) {
    if (is.character(data_def$data[[i]])) {
      suppressWarnings(data_def$data[[i]] <- stringr::str_conv(data_def$data[[i]], "UTF-8"))
    }
  }

  # Output
  data_def
}