-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_cleaning.R
82 lines (69 loc) · 3.67 KB
/
data_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
library(dplyr)
library(readxl)
#install.packages("sjlabelled")
library(sjlabelled)
#https://rdrr.io/cran/sjlabelled/man/remove_all_labels.html
#The above package was helpful to deal with the labelled 2018 dataset, before
#combining all of them.
# ---- Raw inputs ----
# Each load() call restores the objects saved in one yearly .rdata file; the
# object names used further down (e.g. Latinobarometro_2015_Eng) are
# presumably the ones stored in these files.
load("F00004532-Latinobarometro_2015_r/Latinobarometro_2015_Eng.rdata")
load("F00005906-Latinobarometro2016_r/Latinobarometro2016Eng_v20170205.rdata")
load("F00006501-Latinobarometro2017_r/Latinobarometro2017Eng_v20180117.rdata")
# The 2018 wave is distributed as an .rds file, so it is read with readRDS().
Latinobarometro_2018_Esp_R_v20190303 <- readRDS(
  "F00008548-Latinobarometro_2018_Esp_R_v20190303/Latinobarometro_2018_Esp_R_v20190303.rds"
)
load("LAT_Latinobarometro2013_r/Latinobarometro2013Eng.rdata")

# ---- Codebook ----
# Time-series spreadsheet that is used below as the codebook.
F00008653_SerieDeTiempo_1995_2018 <- read_xlsx("F00008653-SerieDeTiempo_1995_2018.xlsx")

# Drop the year columns that will not be analyzed (v1995 through v2011), then
# strip the first "v" from the names of columns 9 to 13 so the remaining year
# columns are named by the bare year.
codebook <- F00008653_SerieDeTiempo_1995_2018 %>%
  dplyr::select(-('v1995':'v2011')) %>%
  rename_with(function(col_name) sub("v", "", col_name), .cols = 9:13)

# ---- Year column for the 2013 and 2015 waves ----
# Set `numinves` to the survey year so these two waves carry their year in the
# same way the later code expects for every wave.
latinobarometro_2015 <- Latinobarometro_2015_Eng %>%
  mutate(numinves = 2015, .keep = "unused")
latinobarometro_2013 <- Latinobarometro2013Eng %>%
  mutate(numinves = 2013, .keep = "unused")
# Extracts the Brazilian observations from one single-year survey data set.
#
# Column names are upper-cased first, so the country filter works no matter
# how the original file capitalised them; only rows with IDENPA == 76 are kept.
#
# Arguments:
#   data_year: single-year data frame whose first cell ([1, 1]) holds the
#              survey year.
#
# Side effect: the filtered data frame is stored in the global environment
# under the name BRAZIL_<year>.
brazil_filter <- function(data_year){
  # Each input covers a single year, so the first cell of its first column is
  # enough to identify the wave and to build the name of the output object.
  survey_year <- as.character(data_year[1, 1])
  target_name <- paste0("BRAZIL_", survey_year)

  brazil_rows <- data_year %>%
    rename_with(.fn = toupper) %>%
    filter(IDENPA == 76)

  assign(target_name, brazil_rows, envir = .GlobalEnv)
}
# Create one BRAZIL_<year> data frame per survey wave (2013, 2015-2018).
invisible(lapply(
  list(
    latinobarometro_2013,
    latinobarometro_2015,
    Latinobarometro2016Eng_v20170205,
    Latinobarometro2017Eng_v20180117,
    Latinobarometro_2018_Esp_R_v20190303
  ),
  brazil_filter
))

# Codebook 'Indice' codes of the variables of interest; the codebook is then
# reduced to the rows describing exactly these variables.
vars_interest <- c(
  "A_001_001", "A_003_031", "H_002_101", "H_002_111", "H_002_161",
  "I_001_001", "S_700", "S_701", "X_002"
)
codebook_reduced <- codebook %>%
  filter(`Indice` %in% vars_interest)
# Keeps only the variables of interest from one single-year data set and
# renames them to the year-independent codes in the codebook's 'Indice' column.
#
# The survey year is read from the first cell of `whole_data` (each data set
# covers exactly one year). That year picks the column of `codebook_reduced`
# that lists the year-specific variable names.
#
# Arguments:
#   whole_data: single-year data frame whose first cell ([1, 1]) holds the year.
#   first:      if TRUE, the result is stored in the global environment as
#               `combining_dataset`; otherwise as `BRAZIL_<year>_FILTERED`.
#
# Depends on the globals `codebook_reduced` and `vars_interest`.
# Returns the filtered, renamed data frame invisibly.
variable_selection <- function(whole_data, first = FALSE){
  year_function <- as.character(whole_data[1, 1])
  if (!year_function %in% names(codebook_reduced)) {
    stop("No codebook column found for year '", year_function, "'.",
         call. = FALSE)
  }

  # Pair every 'Indice' code with the variable name used in this year.
  in_scope <- codebook_reduced[["Indice"]] %in% vars_interest
  index_codes <- as.character(codebook_reduced[["Indice"]][in_scope])
  year_vars <- as.character(codebook_reduced[[year_function]][in_scope])

  # Codes without a variable name for this year cannot be selected; report
  # them instead of failing inside select().
  not_available <- is.na(year_vars)
  if (any(not_available)) {
    warning("No ", year_function, " variable listed in the codebook for: ",
            paste(index_codes[not_available], collapse = ", "),
            call. = FALSE)
  }
  lookup <- stats::setNames(year_vars[!not_available],
                            index_codes[!not_available])

  # all_of() makes the use of an external character vector explicit and
  # errors if a listed column is missing from the data.
  filtered_data <- dplyr::select(whole_data, dplyr::all_of(unname(lookup)))
  # Rename by exact column name (new = old). A regex substitution on the
  # names could also rewrite columns that merely contain the old name.
  filtered_data <- dplyr::rename(filtered_data, dplyr::all_of(lookup))

  # The 2018 data set is labelled; strip the labels so it can be combined
  # with the other years, regardless of which branch stores the result.
  if (year_function == "2018") {
    filtered_data <- sjlabelled::remove_all_labels(filtered_data)
  }

  if (isTRUE(first)) {
    # Starts the combined data set.
    new_name <- "combining_dataset"
  } else {
    new_name <- paste("BRAZIL", year_function, "FILTERED", sep = "_")
  }
  assign(new_name, filtered_data, envir = .GlobalEnv)
  invisible(filtered_data)
}
# 2013 starts the combined data set (stored as `combining_dataset`).
variable_selection(BRAZIL_2013, first = TRUE)

# The remaining waves are stored as BRAZIL_<year>_FILTERED.
invisible(lapply(
  list(BRAZIL_2018, BRAZIL_2017, BRAZIL_2016, BRAZIL_2015),
  variable_selection
))

# Stack all waves, in the same order as before, into one data frame.
full_dataset <- combining_dataset %>%
  bind_rows(
    BRAZIL_2018_FILTERED,
    BRAZIL_2017_FILTERED,
    BRAZIL_2016_FILTERED,
    BRAZIL_2015_FILTERED
  )