Join GitHub today
GitHub is home to over 50 million developers working together to host and review code, manage projects, and build software together.
Sign up| # analyze survey data for free (http://asdfree.com) with the r language | |
| # american community survey | |
| # 2011 person and household files | |
| # # # # # # # # # # # # # # # # # | |
| # # block of code to run this # # | |
| # # # # # # # # # # # # # # # # # | |
| # library(downloader) | |
| # setwd( 'C:/My Directory/ACS/' ) | |
| # source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/American%20Community%20Survey/2011%20single-year%20-%20analysis%20examples.R" , prompt = FALSE , echo = TRUE ) | |
| # # # # # # # # # # # # # # # | |
| # # end of auto-run block # # | |
| # # # # # # # # # # # # # # # | |
| # contact me directly for free help or for paid consulting work | |
| # anthony joseph damico | |
| # ajdamico@gmail.com | |
| # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
| ##################################################################################################################################### | |
| # prior to running this analysis script, the acs 2011 single-year file must be loaded as a monet database-backed survey object # | |
| # on the local machine. running the 2005-2011 download and create database script will create a monet database containing this file # | |
| # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
| # https://github.com/ajdamico/asdfree/blob/master/American%20Community%20Survey/download%20all%20microdata.R # | |
| # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
| # that script will create a file "acs2011_1yr.rda" in C:/My Directory/ACS or wherever the working directory was set for the program # | |
| ##################################################################################################################################### | |
| # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # | |
| library(survey) # load survey package (analyzes complex design surveys) | |
| library(MonetDBLite) | |
| library(DBI) # load the DBI package (implements the R-database coding) | |
| # load the desired american community survey monet database-backed complex sample design objects | |
| # uncomment one of these lines by removing the `#` at the front.. | |
| load( 'acs2011_1yr.rda' ) # analyze the 2011 single-year acs | |
| # load( 'acs2010_1yr.rda' ) # analyze the 2010 single-year acs | |
| # load( 'acs2010_3yr.rda' ) # analyze the 2008-2010 three-year acs | |
| # load( 'acs2010_5yr.rda' ) # analyze the 2006-2010 five-year acs | |
| # note: this r data file should already contain both the merged (person + household) and household-only designs | |
| # connect the complex sample designs to the monet database # | |
| acs.m <- open( acs.m.design , driver = MonetDBLite() ) # merged design | |
| acs.h <- open( acs.h.design , driver = MonetDBLite() ) # household-only design | |
| ########################### | |
| # variable recode example # | |
| ########################### | |
| # construct a new age category variable in the dataset: 0-4, 5-9, 10-14...55-59, 60-64, 65+ | |
| acs.m <- update( acs.m , agecat = 1 + findInterval( agep , seq( 5 , 65 , 5 ) ) ) | |
| # print the distribution of that age category | |
| svymean( ~ factor( agecat ) , acs.m ) | |
| ################################################ | |
| # ..and immediately start the example analyses # | |
| ################################################ | |
| # count the total (unweighted) number of records in acs # | |
| # simply use the nrow function.. | |
| nrow( acs.m ) | |
| # ..on the svrepdesign object | |
| class( acs.m ) | |
| # name the database files in the "MonetDB" folder of the current working directory | |
| dbfolder <- paste0( getwd() , "/MonetDB" ) | |
| # open the connection to the monetdblite database | |
| db <- dbConnect( MonetDBLite::MonetDBLite() , dbfolder ) | |
| # perform the same unweighted count directly from the sql table | |
| # stored inside the monet database on your hard disk (as opposed to RAM) | |
| dbGetQuery( db , "SELECT COUNT(*) AS num_records FROM acs2011_1yr_m" ) | |
| # count the total (unweighted) number of records in acs # | |
| # broken out by state # | |
| # note: this is easiest by simply running a sql query on the monet database directly | |
| dbGetQuery( db , "SELECT st , COUNT(*) as num_records FROM acs2011_1yr_m GROUP BY st" ) | |
| # count the weighted number of individuals in acs # | |
| # the population of the united states (including group quarters residents: both institionalized and non-institutionalized) # | |
| svytotal( ~one , acs.m ) | |
| # note that this is exactly equivalent to summing up the weight variable | |
| # from the original database (.db) file connection | |
| dbGetQuery( db , "SELECT SUM( pwgtp ) AS sum_weights FROM acs2011_1yr_m" ) | |
| # the population of the united states # | |
| # by state | |
| svytotal( ~one , acs.m , byvar = ~st ) | |
| # note: the above command is one example of how the r survey package differs from the r survey package | |
| # calculate the mean of a linear variable # | |
| # average age - nationwide | |
| svymean( ~agep , acs.m ) | |
| # by state | |
| svymean( ~agep , acs.m , byvar = ~st ) | |
| # calculate the distribution of a categorical variable # | |
| # first, force the variable to be a factor class | |
| acs.m <- update( acs.m , hicov = factor( hicov ) ) | |
| # percent uninsured - nationwide | |
| svymean( ~hicov , acs.m ) | |
| # by state | |
| svyby( ~hicov , ~st , acs.m , svymean ) | |
| # calculate the median and other percentiles # | |
| # 25th, median, and 75th percentile of age of residents of the united states | |
| svyquantile( ~agep , acs.m , c( .25 , .5 , .75 ) ) | |
| ###################### | |
| # subsetting example # | |
| ###################### | |
| # restrict the acs.m object to females only | |
| acs.m.female <- subset( acs.m , sex == 2 ) | |
| # now any of the above commands can be re-run | |
| # using the acs.m.female object | |
| # instead of the acs.m object | |
| # in order to analyze females only | |
| # calculate the mean of a linear variable # | |
| # average age - nationwide, restricted to females | |
| svymean( ~agep , acs.m.female ) | |
| # median age - nationwide, restricted to females | |
| svyquantile( ~agep , acs.m.female , 0.5 ) | |
| ################### | |
| # export examples # | |
| ################### | |
| # calculate the distribution of a categorical variable # | |
| # by region of the country | |
| # store the results into a new object | |
| coverage.by.region <- svyby( ~hicov , ~region , acs.m , svymean ) | |
| # print the results to the screen | |
| coverage.by.region | |
| # now you have the results saved into a new svyby object.. | |
| class( coverage.by.region ) | |
| # print only the statistics (coefficients) to the screen | |
| coef( coverage.by.region ) | |
| # print only the standard errors to the screen | |
| SE( coverage.by.region ) | |
| # this object can be coerced (converted) to a data frame.. | |
| coverage.by.region <- data.frame( coverage.by.region ) | |
| # ..and then immediately exported as a comma-separated value file | |
| # into your current working directory | |
| write.csv( coverage.by.region , "coverage by region.csv" ) | |
| # ..or trimmed to only contain the values you need. | |
| # here's the uninsured percentage by region, | |
| # with accompanying standard errors | |
| uninsured.rate.by.region <- | |
| coverage.by.region[ , c( 1 , 3 , 5 ) ] | |
| # print the new results to the screen | |
| uninsured.rate.by.region | |
| # this can also be exported as a comma-separated value file | |
| # into your current working directory | |
| write.csv( uninsured.rate.by.region , "uninsured rate by region.csv" ) | |
| # ..or directly made into a bar plot | |
| barplot( | |
| uninsured.rate.by.region[ , 1 ] , | |
| main = "Uninsured Rate by Region of the Country" , | |
| names.arg = c( "Northeast" , "Midwest" , "South" , "West" , "Puerto Rico" ) , | |
| ylim = c( 0 , .40 ) | |
| ) | |
| ############################ | |
| # end of analysis examples # | |
| ############################ | |
| # close the connection to the two svrepdesign design objects | |
| close( acs.m ) | |
| close( acs.h ) | |
| # disconnect from the current monet database | |
| dbDisconnect( db , shutdown = TRUE ) | |