-
Notifications
You must be signed in to change notification settings - Fork 2
/
extractData2.R
175 lines (162 loc) · 9.07 KB
/
extractData2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#### extractData
#############################################################################
#' Extract Data 2
#'
#' Extract \code{data.frame} from a \code{GADSdat} object for analyses in \code{R}. Per default, missing codes are applied but
#' value labels are dropped. Alternatively, value labels can be selectively applied via
#' \code{labels2character}, \code{labels2factor}, and \code{labels2ordered}.
#' For extracting meta data see \code{\link{extractMeta}}.
#'
#' A \code{GADSdat} object includes actual data (\code{GADSdat$dat}) and the corresponding meta data information
#' (\code{GADSdat$labels}). \code{extractData2} extracts the data and applies relevant meta data on value level
#' (missing conversion, value labels),
#' so the data can be used for analyses in \code{R}. Variable labels are retained as \code{label} attributes on column level.
#'
#' If \code{factor} are extracted via \code{labels2factor} or \code{labels2ordered}, an attempt is made to preserve the underlying integers.
#' If this is not possible, a warning is issued.
#' As \code{SPSS} has almost no limitations regarding the underlying values of labeled
#' integers and \code{R}'s \code{factor} format is very strict (no \code{0}, only integers increasing by \code{+ 1}),
#' this procedure can lead to frequent problems.
#'
#'@param GADSdat A \code{GADSdat} object.
#'@param convertMiss Should values tagged as missing values be recoded to \code{NA}?
#'@param labels2character For which variables should values be recoded to their labels? The resulting variables
#'are of type \code{character}.
#'@param labels2factor For which variables should values be recoded to their labels? The resulting variables
#'are of type \code{factor}.
#'@param labels2ordered For which variables should values be recoded to their labels? The resulting variables
#'are of type \code{ordered}.
#'@param dropPartialLabels Should value labels for partially labeled variables be dropped?
#'If \code{TRUE}, the partial labels will be dropped. If \code{FALSE}, the variable will be converted
#'to the class specified in \code{labels2character}, \code{labels2factor}, or \code{labels2ordered}.
#'
#'@return Returns a data frame.
#'
#'@examples
#'# Extract Data for Analysis
#'dat <- extractData2(pisa)
#'
#'# convert only some variables to character, all others remain numeric
#'dat <- extractData2(pisa, labels2character = c("schtype", "ganztag"))
#'
#'# convert only some variables to factor, all others remain numeric
#'dat <- extractData2(pisa, labels2factor = c("schtype", "ganztag"))
#'
#'# convert all labeled variables to factors
#'dat <- extractData2(pisa, labels2factor = namesGADS(pisa))
#'
#'# convert somme variables to factor, some to character
#'dat <- extractData2(pisa, labels2character = c("schtype", "ganztag"),
#' labels2factor = c("migration"))
#'
#'@export
extractData2 <- function(GADSdat,
convertMiss = TRUE,
labels2character = NULL,
labels2factor = NULL,
labels2ordered = NULL,
dropPartialLabels = TRUE) {
UseMethod("extractData2")
}
#'@export
extractData2.GADSdat <- function(GADSdat,
convertMiss = TRUE,
labels2character = NULL,
labels2factor = NULL,
labels2ordered = NULL,
dropPartialLabels = TRUE) {
check_GADSdat(GADSdat)
# input validation
if(!is.null(labels2character)) check_vars_in_GADSdat(GADSdat, labels2character)
if(!is.null(labels2factor)) check_vars_in_GADSdat(GADSdat, labels2factor)
#browser()
if(!is.null(labels2character) && !is.null(labels2factor)) {
dups <- c(labels2character, labels2factor)[duplicated(c(labels2character, labels2factor))]
if(length(dups) > 0) stop("The following variables are both in 'labels2character' and 'labels2factor': ",
paste(dups, collapse = ", "))
}
if(!is.null(labels2character) && !is.null(labels2ordered)) {
dups <- c(labels2character, labels2ordered)[duplicated(c(labels2character, labels2ordered))]
if(length(dups) > 0) stop("The following variables are both in 'labels2character' and 'labels2ordered': ",
paste(dups, collapse = ", "))
}
if(!is.null(labels2ordered) && !is.null(labels2factor)) {
dups <- c(labels2ordered, labels2factor)[duplicated(c(labels2ordered, labels2factor))]
if(length(dups) > 0) stop("The following variables are both in 'labels2ordered' and 'labels2factor': ",
paste(dups, collapse = ", "))
}
dat <- GADSdat$dat
labels <- GADSdat$labels
## missings
if(identical(convertMiss, TRUE)) dat <- miss2NA(GADSdat)
## labels
dat <- labels2values2(dat = dat, labels = labels, convertMiss = convertMiss, dropPartialLabels = dropPartialLabels,
labels2character = labels2character, labels2factor = labels2factor, labels2ordered = labels2ordered)
## varLabels
dat <- varLabels_as_labels(dat = dat, labels = labels)
dat
}
#'@export
extractData2.trend_GADSdat <- function(GADSdat,
convertMiss = TRUE,
labels2character = NULL,
labels2factor = NULL,
labels2ordered = NULL,
dropPartialLabels = TRUE) {
# Input validation
check_trend_GADSdat(GADSdat)
if(!is.null(labels2character) && !is.character(labels2character)) stop("'labels2character' must be a character vector.")
if(!is.null(labels2factor) && !is.character(labels2factor)) stop("'labels2factor' must be a character vector.")
if(!is.null(labels2ordered) && !is.character(labels2ordered)) stop("'labels2ordered' must be a character vector.")
all_GADSdat_names <- unique(unlist(namesGADS(GADSdat)))
check_vars_in_vector(all_GADSdat_names, vars = labels2character, vec_nam = "GADSdats")
check_vars_in_vector(all_GADSdat_names, vars = labels2factor, vec_nam = "GADSdats")
check_vars_in_vector(all_GADSdat_names, vars = labels2ordered, vec_nam = "GADSdats")
dat_list <- lapply(names(GADSdat$datList), function(nam) {
gads <- extractGADSdat(all_GADSdat = GADSdat, name = nam)
single_labels2character <- labels2character[labels2character %in% namesGADS(gads)]
single_labels2factor <- labels2factor[labels2factor %in% namesGADS(gads)]
single_labels2ordered <- labels2ordered[labels2ordered %in% namesGADS(gads)]
dat <- extractData2(gads, convertMiss = convertMiss, labels2character = single_labels2character, labels2factor = single_labels2factor,
labels2ordered = single_labels2ordered, dropPartialLabels = dropPartialLabels)
dat
})
dat_out <- do.call(plyr::rbind.fill, dat_list)
dat_out[, c(setdiff(names(dat_out), "year"), "year")]
}
# converts labels to values
labels2values2 <- function(dat, labels, convertMiss, dropPartialLabels, labels2character, labels2factor, labels2ordered) {
if(is.null(labels2character) && is.null(labels2factor)) return(dat)
# Which variables should their value labels be applied to?
convertVariables <- c(labels2character, labels2factor, labels2ordered)
#stopifnot(is.character(convertVariables) && length(convertVariables) > 0)
change_labels <- labels[labels[, "varName"] %in% convertVariables, ] # careful, from here use only change_labels!
# check value labels, remove incomplete labels from insertion to protect variables
if(identical(dropPartialLabels, TRUE)) {
drop_labels <- unlist(lapply(unique(labels$varName), check_labels, dat = dat, labels = labels,
convertMiss = convertMiss))
change_labels <- change_labels[!change_labels$varName %in% drop_labels, ]
}
# early return, if no values are to be recoded
if(nrow(change_labels) == 0) return(dat)
# convert labels into values (use recode function from applyChangeMeta)
change_table <- change_labels[, c("varName", "value", "valLabel")]
names(change_table) <- c("varName", "value", "value_new")
dat2 <- recode_dat(dat, changeTable = change_table)
# identify modified variables
is_character_old <- unlist(lapply(dat, function(var) is.character(var)))
is_character_new <- unlist(lapply(dat2, function(var) is.character(var)))
changed_variables <- names(dat2)[is_character_new & !is_character_old]
# convert characters to factor if specified (keep ordering if possible)
#if(!is.null(labels2ordered)) browser()
changed_variables_labels2factor <- intersect(labels2factor, changed_variables)
changed_variables <- setdiff(changed_variables, changed_variables_labels2factor)
if(length(changed_variables_labels2factor) > 0) {
dat2 <- char2fac(dat = dat2, labels = labels, vars = changed_variables_labels2factor, convertMiss = convertMiss, ordered = FALSE)
}
changed_variables_labels2ordered <- intersect(labels2ordered, changed_variables)
if(length(changed_variables_labels2ordered) > 0) {
dat2 <- char2fac(dat = dat2, labels = labels, vars = changed_variables_labels2ordered, convertMiss = convertMiss, ordered = TRUE)
}
dat2
}