-
Notifications
You must be signed in to change notification settings - Fork 2
/
remove2NAchar.R
89 lines (81 loc) · 4.06 KB
/
remove2NAchar.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#### Shorten multiple text variables
#############################################################################
#' Shorten multiple text variables while giving NA codes.
#'
#' Shorten text variables from a certain number on while coding overflowing answers as complete missings.
#'
#' In some cases, multiple text variables contain the information of one variable (e.g. multiple answers to an open item).
#' If this is a case, sometimes the number text variables displaying this variable should be limited. \code{remove2NAchar}
#' allows shortening multiple character variables, this means character variables after \code{max_num} are removed
#' from the \code{GADSdat}. Cases, which had valid responses on these removed variables are coded as missings (using
#' \code{na_value} and \code{na_label}).
#'
#'@param GADSdat A \code{GADSdat} object.
#'@param vars A character vector with the names of the text variables.
#'@param max_num Maximum number of text variables. Additional text variables will be removed and NA codes given accordingly.
#'@param na_value Which NA value should be given in cases of too many values on text variables.
#'@param na_label Which value label should be given to the \code{na_value}.
#'
#'@return Returns the modified \code{GADSdat}.
#'
#'@examples
#'## create an example GADSdat
#'example_df <- data.frame(ID = 1:4,
#' citizenship1 = c("German", "English", "missing by design", "Chinese"),
#' citizenship2 = c(NA, "German", "missing by design", "Polish"),
#' citizenship3 = c(NA, NA, NA, "German"),
#' stringsAsFactors = FALSE)
#'gads <- import_DF(example_df)
#'
#'## shorten character variables
#'gads2 <- remove2NAchar(gads, vars = c("citizenship1", "citizenship2", "citizenship3"),
#' na_value = -99, na_label = "missing: too many answers")
#'
#'
#'@export
remove2NAchar <- function(GADSdat, vars, max_num = 2, na_value, na_label) {
UseMethod("remove2NAchar")
}
#'@export
remove2NAchar.GADSdat <- function(GADSdat, vars, max_num = 2, na_value, na_label) {
check_GADSdat(GADSdat)
if(!is.numeric(max_num) || length(max_num) != 1) stop("'max_num' needs to be a single numeric value greater than 0.")
if(!is.numeric(na_value) || length(na_value) != 1) stop("'na_value' needs to be a single numeric value.")
if(!is.character(na_label) || length(na_label) != 1) stop("'na_label' needs to be a single character value.")
suppressMessages(miniGADS <- extractVars(GADSdat, vars = vars))
suppressWarnings(mini_dat_ori <- extractData(miniGADS))
mini_dat <- max_num_strings2NA(mini_dat_ori, max_num = max_num, na_value = na_value)
# replace old variables (to maintain original column ordering)
dat <- GADSdat$dat
for(i in names(mini_dat)) {
dat[, i] <- mini_dat[, i]
}
# cut text variables
remove_vars <- vars[-(1:max_num)]
dat2 <- dat[, !names(dat) %in% remove_vars, drop = FALSE]
# restore specific missing codes in character variables
missing_values <- unique(GADSdat$labels[which(GADSdat$labels$varName %in% vars & GADSdat$labels$missings == "miss"), "value"])
for(i in vars[!vars %in% remove_vars]) {
dat2[, i] <- ifelse(GADSdat$dat[, i] %in% missing_values, yes = GADSdat$dat[, i], no = dat2[, i])
}
GADSdat_out <- updateMeta(GADSdat, dat2)
for(i in vars[!vars %in% remove_vars]) {
GADSdat_out <- changeValLabels(GADSdat_out, varName = i, value = na_value, valLabel = na_label)
GADSdat_out <- changeMissings(GADSdat_out, varName = i, value = na_value, missings = "miss")
}
GADSdat_out
}
# count text variables, give missings if more than x left
max_num_strings2NA <- function(dat, max_num, na_value) {
#dat[, vars] <- ifelse(!is.na(dat[, max_num]), yes = NA, no = dat[, vars])
stopifnot(is.numeric(max_num) && length(max_num) == 1)
stopifnot(ncol(dat) > 1)
max_var <- names(dat)[max_num + 1]
if(max_num >= ncol(dat)) return(dat)
for(i in seq(nrow(dat))) {
if(!is.na(dat[i, max_var])) {
dat[i, ] <- na_value
}
}
dat
}