/
2010 consolidated - analyze with brr.R
425 lines (308 loc) · 11.8 KB
/
2010 consolidated - analyze with brr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
# analyze us government survey data with the r language
# medical expenditure panel survey
# 2010 consolidated
# if you have never used the r language before,
# watch this two minute video i made outlining
# how to run this script from start to finish
# http://www.screenr.com/Zpd8
# anthony joseph damico
# ajdamico@gmail.com
# if you use this script for a project, please send me a note
# it's always nice to hear about how people are using this stuff
# for further reading on cross-package comparisons, see:
# http://journal.r-project.org/archive/2009-2/RJournal_2009-2_Damico.pdf
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#############################################################################################################################################################
# prior to running this analysis script, the linkage brr file and 2010 consolidated file must be loaded as an r data file (.rda) file on the local machine. #
# running the 1996-2010 household component - download all microdata.R script will create an R data file (.rda) #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# https://raw.github.com/ajdamico/usgsd/master/Medical%20Expenditure%20Panel%20Survey/1996-2010%20household%20component%20-%20download%20all%20microdata.R #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# that script will create files "linkage - brr.rda" and "2010 - consolidated.rda" in C:/My Directory/MEPS (or wherever the working directory was chosen) #
#############################################################################################################################################################
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
###############################################
# balanced repeated replication (brr) version #
# this script uses the brr method to calculate standard errors
# brr has the disadvantage of being computationally more difficult
# and the advantage of producing standard errors or confidence intervals
# on percentile statistics
# (for example, tsl cannot compute the confidence interval around a median)
# if you are not sure which method to use, use this brr script instead of tsl
# available in the same folder
# the statistics (means, medians, percents, and counts) from brr and tsl designs
# will match exactly. the standard errors and confidence intervals
# will be slightly different. both methods are considered valid.
##############################################################################
# Analyze the 2010 Medical Expenditure Panel Survey consolidated file with R #
##############################################################################
# set your working directory.
# the MEPS 2010 data file will be stored here
# after downloading and importing it.
# use forward slashes instead of back slashes
# set your working directory.
# this directory must contain the MEPS 2010 consolidated (.rda) file
# as well as the MEPS linkage - brr (.rda) file
# created by the R program specified above
# use forward slashes instead of back slashes
# uncomment this line by removing the `#` at the front..
# setwd( "C:/My Directory/MEPS/" )
# ..in order to set your current working directory
# remove the # in order to run this install.packages line only once
# install.packages( "survey" )
# load the survey package (analyzes complex design surveys)
# library() halts immediately with an error if the package is missing,
# unlike require() which merely returns FALSE and lets the script
# continue until a later, more confusing failure
library(survey)

# set R to produce conservative standard errors instead of crashing
# when a stratum contains a single sampling unit
# http://faculty.washington.edu/tlumley/survey/exmample-lonely.html
options( survey.lonely.psu = "adjust" )
# this setting matches the MISSUNIT option in SUDAAN

# if this option is set to TRUE
# R will exactly match SUDAAN results and Stata with the MSE option results
options( survey.replicates.mse = TRUE )
# otherwise if it is commented out or set to FALSE
# R will exactly match Stata without the MSE option results
# Stata svyset command notes can be found here: http://www.stata.com/help.cgi?svyset
# read the two R data files created by the download script
# into the current workspace:
# the `consolidated.2010` data frame comes from the consolidated file..
load( file = "2010 - consolidated.rda" )
# ..and the `brr` data frame comes from the replicate-weight linkage file
load( file = "linkage - brr.rda" )
####################################
# if your computer runs out of RAM #
# if you get a memory error        #
####################################

# comment out these lines if you'd rather not restrict the MEPS 10 file
# to only the columns you expect to use in the analysis

# the MEPS 2010 consolidated file has almost 2,000 different columns
# but most analyses only touch a handful of them,
# so dropping the unused columns frees a lot of memory

# character vector naming every column this analysis needs
KeepVars <-
  c(
    "DUPERSID" , "PANEL" ,    # unique person / panel identifiers
    "VARPSU" , "VARSTR" ,     # cluster and strata variables for the complex design
    "PERWT10F" ,              # 2010 person-level analysis weight
    "INS10X" ,                # annualized insurance coverage
    "TOTEXP10" ,              # total annual medical expenditure
    "REGION10" ,              # region of the country
    "SEX"                     # gender
  )

# keep only those columns
# (character indexing of a data frame selects whole columns)
consolidated.2010 <- consolidated.2010[ KeepVars ]

# clear up RAM - garbage collection function
gc()

############################
# end of RAM-clearing code #
############################
############################
# end of RAM-clearing code #
############################
#################################################
# merge consolidated file with brr weights file #
#################################################

# drop the DUID and PID columns from the brr table up front,
# otherwise they would appear twice in the merged data frame
brr <- brr[ , setdiff( names( brr ) , c( "DUID" , "PID" ) ) ]

# attach each person's replicate weights to their consolidated record,
# matching on the person identifier and panel number
MEPS.10.consolidated.with.brr.df <-
  merge( consolidated.2010 , brr , by = c( "DUPERSID" , "PANEL" ) )

# the merge must be one-to-one: the merged file should contain
# exactly as many records as the original consolidated file
if ( nrow( MEPS.10.consolidated.with.brr.df ) != nrow( consolidated.2010 ) )
  stop( "problem with merge - merged file should have the same number of records as the original consolidated file" )
###################################################
# survey design for balanced repeated replication #
###################################################

# create the replicate-weight survey design object
# from the merged data frame of MEPS data
meps.brr.design <-
  svrepdesign(
    data = MEPS.10.consolidated.with.brr.df ,
    # main person-level analysis weight
    weights = ~PERWT10F ,
    # balanced repeated replication standard errors
    type = "BRR" ,
    # the BRR columns are adjustment factors, not pre-multiplied
    # replicate weights; spell out FALSE rather than the
    # re-assignable shorthand F
    combined.weights = FALSE ,
    # regular expression matching the replicate-weight columns BRR1, BRR2, ...
    repweights = "BRR[1-9]+"
  )
# notice the 'meps.brr.design' object used in all subsequent analysis commands

# the design object now holds its own copy of the data,
# so the two merged data frames are no longer of value
rm( MEPS.10.consolidated.with.brr.df )
rm( brr )
# clear up RAM
gc()
#####################
# analysis examples #
#####################

# count the total (unweighted) number of records in meps #
# simply use the nrow function
nrow( meps.brr.design )

# nrow works on both data frame objects..
class( consolidated.2010 )
# ..and survey design objects
class( meps.brr.design )

# count the total (unweighted) number of records in meps #
# broken out by region of the country #
svyby( ~TOTEXP10 , by = ~REGION10 , design = meps.brr.design , FUN = unwtd.count )
# count the weighted number of individuals in meps #

# add a constant column 'one' holding the number 1 for every record,
# so weighted record counts can be computed with svytotal
meps.brr.design <- update( meps.brr.design , one = 1 )

# the civilian, non-institutionalized population of the united states #
svytotal( ~one , meps.brr.design )

# note that this is exactly equivalent to summing up the weight variable
# from the original MEPS data frame
# (assuming this data frame was not cleared out of RAM above)
sum( consolidated.2010$PERWT10F )

# the civilian, non-institutionalized population of the united states #
# by region of the country
svyby( ~one , by = ~REGION10 , design = meps.brr.design , FUN = svytotal )
# calculate the mean of a linear variable #

# average medical expenditure - nationwide
svymean( ~TOTEXP10 , design = meps.brr.design )

# by region of the country
svyby( ~TOTEXP10 , by = ~REGION10 , design = meps.brr.design , FUN = svymean )
# calculate the distribution of a categorical variable #

# INS10X should be treated as a factor (categorical) variable
# instead of a numeric (linear) variable --
# the svyby command below will not run without this conversion
meps.brr.design <- update( meps.brr.design , INS10X = factor( INS10X ) )

# percent uninsured - nationwide
svymean( ~INS10X , design = meps.brr.design )

# by region of the country
svyby( ~INS10X , by = ~REGION10 , design = meps.brr.design , FUN = svymean )
# calculate the median and other percentiles #

# note that unlike a taylor-series survey design
# the brr design does allow for
# calculation of standard errors on percentile statistics

# minimum, 25th, 50th, 75th, maximum
# medical expenditure in the united states
svyquantile(
  ~TOTEXP10 ,
  design = meps.brr.design ,
  c( 0 , .25 , .5 , .75 , 1 )
)

# by region of the country
svyby(
  ~TOTEXP10 ,
  ~REGION10 ,
  design = meps.brr.design ,
  svyquantile ,
  c( 0 , .25 , .5 , .75 , 1 ) ,
  # request confidence intervals so svyby can report standard errors;
  # spell out TRUE rather than the re-assignable shorthand T
  ci = TRUE
)
######################
# subsetting example #
######################

# build a second design object restricted to females only
# ( %in% is used so any missing SEX values simply drop out
#   instead of propagating NA )
meps.brr.design.female <- subset( meps.brr.design , SEX %in% 2 )

# now any of the above commands can be re-run
# using the meps.brr.design.female object
# instead of the meps.brr.design object
# in order to analyze females only

# calculate the mean of a linear variable #
# average medical expenditure - nationwide, restricted to females
svymean( ~TOTEXP10 , design = meps.brr.design.female )
###################
# export examples #
###################

# calculate the distribution of a categorical variable #
# by region of the country,
# storing the results into a new object
coverage.by.region <-
  svyby( ~INS10X , by = ~REGION10 , design = meps.brr.design , FUN = svymean )

# print the results to the screen
coverage.by.region

# now you have the results saved into a new object of type "svyby"
class( coverage.by.region )

# print only the statistics (coefficients) to the screen
coef( coverage.by.region )

# print only the standard errors to the screen
SE( coverage.by.region )

# this object can be coerced (converted) to a data frame..
coverage.by.region <- data.frame( coverage.by.region )

# ..and then immediately exported as a comma-separated value file
# into your current working directory
write.csv( coverage.by.region , "coverage by region.csv" )

# ..or trimmed to only contain the values you need.
# here's the uninsured percentage by region,
# with accompanying standard errors:
# rows 2 through 5, and the three named columns
# NOTE(review): the row range and the "INS10X2" / "se3" column names
# depend on this particular svyby result layout -- confirm them against
# the printed coverage.by.region object before re-using elsewhere
uninsured.rate.by.region <-
  coverage.by.region[ 2:5 , c( "REGION10" , "INS10X2" , "se3" ) ]

# print the new results to the screen
uninsured.rate.by.region

# this can also be exported as a comma-separated value file
# into your current working directory
write.csv( uninsured.rate.by.region , "uninsured rate by region.csv" )

# ..or directly made into a bar plot
barplot(
  uninsured.rate.by.region[ , 2 ] ,
  main = "Uninsured Rate by Region of the Country" ,
  names.arg = c( "Northeast" , "Midwest" , "South" , "West" ) ,
  ylim = c( 0 , .25 )
)

# for more details on how to work with data in r
# check out my two minute tutorial video site
# http://www.twotorials.com/