-
Notifications
You must be signed in to change notification settings - Fork 451
/
2011 single-year - analysis examples.R
233 lines (148 loc) · 7.62 KB
/
2011 single-year - analysis examples.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# analyze survey data for free (http://asdfree.com) with the r language
# american community survey
# 2011 person and household files
# # # # # # # # # # # # # # # # #
# # block of code to run this # #
# # # # # # # # # # # # # # # # #
# library(downloader)
# setwd( 'C:/My Directory/ACS/' )
# source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/American%20Community%20Survey/2011%20single-year%20-%20analysis%20examples.R" , prompt = FALSE , echo = TRUE )
# # # # # # # # # # # # # # #
# # end of auto-run block # #
# # # # # # # # # # # # # # #
# contact me directly for free help or for paid consulting work
# anthony joseph damico
# ajdamico@gmail.com
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#####################################################################################################################################
# prior to running this analysis script, the acs 2011 single-year file must be loaded as a monet database-backed survey object #
# on the local machine. running the 2005-2011 download and create database script will create a monet database containing this file #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# https://github.com/ajdamico/asdfree/blob/master/American%20Community%20Survey/download%20all%20microdata.R #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# that script will create a file "acs2011_1yr.rda" in C:/My Directory/ACS or wherever the working directory was set for the program #
#####################################################################################################################################
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
library(survey) # load survey package (analyzes complex design surveys)
library(MonetDBLite)
library(DBI) # load the DBI package (implements the R-database coding)
# load the desired american community survey monet database-backed complex sample design objects
# uncomment one of these lines by removing the `#` at the front..
load( 'acs2011_1yr.rda' ) # analyze the 2011 single-year acs
# load( 'acs2010_1yr.rda' ) # analyze the 2010 single-year acs
# load( 'acs2010_3yr.rda' ) # analyze the 2008-2010 three-year acs
# load( 'acs2010_5yr.rda' ) # analyze the 2006-2010 five-year acs
# note: this r data file should already contain both the merged (person + household) and household-only designs
# connect the complex sample designs to the monet database #
acs.m <- open( acs.m.design , driver = MonetDBLite() ) # merged design
acs.h <- open( acs.h.design , driver = MonetDBLite() ) # household-only design
###########################
# variable recode example #
###########################
# construct a new age category variable in the dataset: 0-4, 5-9, 10-14...55-59, 60-64, 65+
acs.m <- update( acs.m , agecat = 1 + findInterval( agep , seq( 5 , 65 , 5 ) ) )
# print the distribution of that age category
svymean( ~ factor( agecat ) , acs.m )
################################################
# ..and immediately start the example analyses #
################################################
# count the total (unweighted) number of records in acs #
# simply use the nrow function..
nrow( acs.m )
# ..on the svrepdesign object
class( acs.m )
# name the database files in the "MonetDB" folder of the current working directory
dbfolder <- paste0( getwd() , "/MonetDB" )
# open the connection to the monetdblite database
db <- dbConnect( MonetDBLite::MonetDBLite() , dbfolder )
# perform the same unweighted count directly from the sql table
# stored inside the monet database on your hard disk (as opposed to RAM)
dbGetQuery( db , "SELECT COUNT(*) AS num_records FROM acs2011_1yr_m" )
# count the total (unweighted) number of records in acs #
# broken out by state #
# note: this is easiest by simply running a sql query on the monet database directly
dbGetQuery( db , "SELECT st , COUNT(*) as num_records FROM acs2011_1yr_m GROUP BY st" )
# count the weighted number of individuals in acs #
# the population of the united states (including group quarters residents: both institionalized and non-institutionalized) #
svytotal( ~one , acs.m )
# note that this is exactly equivalent to summing up the weight variable
# from the original database (.db) file connection
dbGetQuery( db , "SELECT SUM( pwgtp ) AS sum_weights FROM acs2011_1yr_m" )
# the population of the united states #
# by state
svytotal( ~one , acs.m , byvar = ~st )
# note: the above command is one example of how the r survey package differs from the r survey package
# calculate the mean of a linear variable #
# average age - nationwide
svymean( ~agep , acs.m )
# by state
svymean( ~agep , acs.m , byvar = ~st )
# calculate the distribution of a categorical variable #
# first, force the variable to be a factor class
acs.m <- update( acs.m , hicov = factor( hicov ) )
# percent uninsured - nationwide
svymean( ~hicov , acs.m )
# by state
svyby( ~hicov , ~st , acs.m , svymean )
# calculate the median and other percentiles #
# 25th, median, and 75th percentile of age of residents of the united states
svyquantile( ~agep , acs.m , c( .25 , .5 , .75 ) )
######################
# subsetting example #
######################
# restrict the acs.m object to females only
acs.m.female <- subset( acs.m , sex == 2 )
# now any of the above commands can be re-run
# using the acs.m.female object
# instead of the acs.m object
# in order to analyze females only
# calculate the mean of a linear variable #
# average age - nationwide, restricted to females
svymean( ~agep , acs.m.female )
# median age - nationwide, restricted to females
svyquantile( ~agep , acs.m.female , 0.5 )
###################
# export examples #
###################
# calculate the distribution of a categorical variable #
# by region of the country
# store the results into a new object
coverage.by.region <- svyby( ~hicov , ~region , acs.m , svymean )
# print the results to the screen
coverage.by.region
# now you have the results saved into a new svyby object..
class( coverage.by.region )
# print only the statistics (coefficients) to the screen
coef( coverage.by.region )
# print only the standard errors to the screen
SE( coverage.by.region )
# this object can be coerced (converted) to a data frame..
coverage.by.region <- data.frame( coverage.by.region )
# ..and then immediately exported as a comma-separated value file
# into your current working directory
write.csv( coverage.by.region , "coverage by region.csv" )
# ..or trimmed to only contain the values you need.
# here's the uninsured percentage by region,
# with accompanying standard errors
uninsured.rate.by.region <-
coverage.by.region[ , c( 1 , 3 , 5 ) ]
# print the new results to the screen
uninsured.rate.by.region
# this can also be exported as a comma-separated value file
# into your current working directory
write.csv( uninsured.rate.by.region , "uninsured rate by region.csv" )
# ..or directly made into a bar plot
barplot(
uninsured.rate.by.region[ , 1 ] ,
main = "Uninsured Rate by Region of the Country" ,
names.arg = c( "Northeast" , "Midwest" , "South" , "West" , "Puerto Rico" ) ,
ylim = c( 0 , .40 )
)
############################
# end of analysis examples #
############################
# close the connection to the two svrepdesign design objects
close( acs.m )
close( acs.h )
# disconnect from the current monet database
dbDisconnect( db , shutdown = TRUE )