Skip to content

Commit

Permalink
move arf to monetdblite
Browse files Browse the repository at this point in the history
  • Loading branch information
ajdamico committed Jun 26, 2016
1 parent c680eec commit b8dae1e
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 37 deletions.
18 changes: 9 additions & 9 deletions Area Resource File/analysis examples.R
@@ -1,6 +1,6 @@
# analyze survey data for free (http://asdfree.com) with the r language
# area resource file
# 2013-2014
# 2014-2015

# # # # # # # # # # # # # # # # #
# # block of code to run this # #
Expand All @@ -26,13 +26,13 @@
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# https://github.com/ajdamico/asdfree/blob/master/Area%20Resource%20File/download.R #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# that script will create a file "arf2013.rda" with 'arf' in C:/My Directory/ARF #
# that script will create a file "arf2014.rda" with 'arf' in C:/My Directory/ARF #
###################################################################################
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


# set your working directory.
# the ARF 2013-2014 data files should have been stored here
# the ARF 2014-2015 data files should have been stored here
# after running the program described above
# use forward slashes instead of back slashes

Expand All @@ -41,16 +41,16 @@
# ..in order to set your current working directory


# load the 2013-2014 ARF data file
load( "arf2013.rda" )
# load the 2014-2015 ARF data file
load( "arf2014.rda" )


# now the 'arf' data frame is available in memory..
# ..but has far too many variables to browse through
ncol( arf )


# the "AHRF 2013-2014 Technical Documentation.xls" file in the current working directory contains field labels
# the "AHRF 2014-2015 Technical Documentation.xls" file in the current working directory contains field labels
# so create a smaller data table with only a few columns of interest
# first, create a character vector containing only the columns you'll need:
variables.to.keep <-
Expand Down Expand Up @@ -103,11 +103,11 @@ length( unique( arf.sub$ssa ) )

# because many counties with fips codes do not have ssa county codes
# here are a few records where the ssa county code is missing (coded as 0 or as the empty string '')
head( arf.sub[ arf.sub$ssa == 0 , ] )
head( arf.sub[ arf.sub$ssa == '' , ] )


# you could print all of them to the screen
arf.sub[ arf.sub$ssa == 0 , ]
arf.sub[ arf.sub$ssa == '' , ]
# ..and find they're mostly the us territories.
# because territories have fips but not ssa county codes

Expand Down Expand Up @@ -174,7 +174,7 @@ nrow( fakedata )

# to merge the arf onto fakedata using the county ssa code,
# try limiting the arf to only records with a non-missing ssa code (neither 0 nor the empty string '')
arf.with.ssa <- subset( arf.sub , ssa != 0 )
arf.with.ssa <- subset( arf.sub , ssa != '' )
# count the number of records in fakedata
nrow( fakedata )
# perform the merge
Expand Down
68 changes: 43 additions & 25 deletions Area Resource File/download.R
Expand Up @@ -35,7 +35,7 @@


# remove the # in order to run this install.packages line only once
# install.packages( c( 'SAScii' , 'descr' , 'RSQLite' , 'downloader' , 'digest' ) )
# install.packages( c( 'SAScii' , 'descr' , 'MonetDBLite' , 'downloader' , 'digest' ) )



Expand All @@ -48,25 +48,26 @@


# load necessary libraries
library(RSQLite) # load RSQLite package (creates database files in R)
library(SAScii) # load the SAScii package (imports ascii data with a SAS script)
library(descr) # load the descr package (converts fixed-width files to delimited files)
library(foreign) # load foreign package (converts data files into R)
library(downloader) # downloads and then runs the source() function on scripts from github
library(DBI) # load the DBI package (sets up main SQL configuration and connectivity functions)
library(MonetDBLite) # load MonetDBLite package (creates database files in R)
library(SAScii) # load the SAScii package (imports ascii data with a SAS script)
library(descr) # load the descr package (converts fixed-width files to delimited files)
library(foreign) # load foreign package (converts data files into R)
library(downloader) # downloads and then runs the source() function on scripts from github


# load the read.SAScii.sqlite / read.SAScii.monetdb function (a variant of read.SAScii that creates a database directly)
source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/SQLite/read.SAScii.sqlite.R" , prompt = FALSE )
source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/MonetDB/read.SAScii.monetdb.R" , prompt = FALSE )

# store the downloaded file locally forever
source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/Download%20Cache/download%20cache.R" , prompt = FALSE , echo = FALSE )

# create a temporary database file and another temporary file
temp.db <- tempfile()
# create a temporary file
tf <- tempfile()


# download the most current ARF file
# and save it as the temporary file
download.file( "http://datawarehouse.hrsa.gov/DataDownload/ARF/AHRF_2014-2015.zip" , tf , mode = 'wb' )
download_cached( "http://datawarehouse.hrsa.gov/DataDownload/ARF/AHRF_2014-2015.zip" , tf , mode = 'wb' )


# unzip all of the files in the downloaded .zip file into the current working directory
Expand All @@ -79,34 +80,51 @@ files <- unzip( tf , exdir = getwd() )
# identify ascii file on your local disk
fn <- files[ grep( '\\.asc' , files ) ]

# build the name of a second file (ending in "_ascii.asc") that will hold the re-encoded copy
fn_ue <- gsub( "\\.asc" , "_ascii.asc" , fn )

# store the pre-run encoding configuration
pre_encoding <- getOption( "encoding" )

# switch the environment to ascii (very strict) encoding
options( encoding = "ASCII" )

# load in the whole file (in ASCII)
arf_load <- readLines( fn )

# write those lines out to the separate `fn_ue` file under the stricter encoding (the original `fn` is left untouched)
writeLines( arf_load , fn_ue )

# remove this object from memory & clear up RAM
rm( arf_load ) ; gc()

# restore the previous encoding settings
options( encoding = pre_encoding )

# identify sas (read-in) import instructions
sas_ri <- files[ grep( '\\.sas' , files ) ]


# create and connect to a temporary SQLite database
db <- dbConnect( SQLite() , temp.db )
# create and connect to a temporary MonetDBLite database
db <- dbConnect( MonetDBLite::MonetDBLite() )


# parse through the ARF without touching RAM #
read.SAScii.sqlite(
fn = fn ,
sas_ri = sas_ri ,
tl = TRUE , # convert all column names to lowercase?
tablename = 'arf' ,
conn = db
read.SAScii.monetdb(
fn = fn_ue ,
sas_ri = sas_ri ,
tl = TRUE , # convert all column names to lowercase?
tablename = 'arf' ,
conn = db ,
na_strings = "." # unlike most other datasets, na strings are dots
)


# read the ARF into RAM
arf <- dbReadTable( db , 'arf' )


# disconnect from the temporary SQLite database
dbDisconnect( db )

# and delete it
file.remove( temp.db )
# disconnect from the temporary MonetDBLite database
dbDisconnect( db , shutdown = TRUE )


# save the arf data table as an R data file (.rda)
Expand Down
9 changes: 6 additions & 3 deletions MonetDB/read.SAScii.monetdb.R
Expand Up @@ -36,12 +36,13 @@ read.SAScii.monetdb <-
# specifying this option creates the temporary file inside the folder specified
try_best_effort = FALSE ,
sas_stru = NULL ,
allow_zero_records = FALSE # by default, expect more than zero records to be imported.

allow_zero_records = FALSE , # by default, expect more than zero records to be imported.
na_strings = "" # by default, na strings are empty
) {
if( is.null( sas_ri ) & is.null( sas_stru ) ) stop( "either sas_ri= or sas_stru= must be specified" )
if( !is.null( sas_ri ) & !is.null( sas_stru ) ) stop( "either sas_ri= or sas_stru= must be specified, but not both" )

if( length( na_strings ) != 1 ) stop( "na_strings must have length of one" )

# before anything else, create the temporary files needed for this function to run
# if the user doesn't specify that the temporary files get stored in a temporary directory
Expand Down Expand Up @@ -198,7 +199,9 @@ read.SAScii.monetdb <-
tablename ,
" FROM '" ,
normalizePath( fn ) ,
"' NULL AS '' " ,
"' NULL AS " ,
paste0( "'" , na_strings , "'" ) ,
" " ,
if( try_best_effort ) " BEST EFFORT " ,
" FWF (" ,
paste0( w , collapse = ", " ) ,
Expand Down

0 comments on commit b8dae1e

Please sign in to comment.