Reading instructions:
- under each subheading, the first code block is the original SAS and the second code block is its R translation;
- Ctrl+F is your friend for jumping straight to the procedure you need.
/* Copy the dataset in_lib.in_ds to a new dataset save_lib.save_ds */
data save_lib.save_ds;
set in_lib.in_ds;
run;
# Here save_lib is a string representing a directory. '.rda' is
# sometimes used instead of '.RData'
save(in_ds, file=file.path(save_lib, "save_ds.RData"))
# To load in_ds back into the environment
load(file.path(save_lib, "save_ds.RData"))
# NOTE(review): load() restores the object under its original name
# (in_ds), not save_ds, so unlike the SAS step no rename happens.
# For a single object, saveRDS()/readRDS() lets you choose the name
# at read time:
#   saveRDS(in_ds, file.path(save_lib, "save_ds.rds"))
#   save_ds = readRDS(file.path(save_lib, "save_ds.rds"))
/* Stack (concatenate) the rows of mydata1 and mydata2 */
data concatenated;
set mydata1 mydata2;
run;
# Note that if a factor variable in the two dataframes has different
# levels, then rbind will take the union of these levels
# while bind_rows will coerce the variable to string.
# NOTE(review): bind_rows' factor handling has changed across dplyr
# versions (recent versions also take the union of levels) — verify
# against the dplyr version in use.
concatenated = rbind(mydata1, mydata2)
concatenated = dplyr::bind_rows(mydata1, mydata2)
/* Keep only rows for adult males when copying in_ds to out_ds */
data out_ds;
set in_ds;
where gender = 'M' and age >= 18;
run;
# NOTE: with base `[` subsetting, rows where gender or age is NA come
# through as all-NA rows (a logical index of NA selects an NA row);
# dplyr::filter drops those rows, which matches the SAS where clause
# more closely.
out_ds = in_ds[in_ds$gender == 'M' & in_ds$age >= 18, ]
# or with dplyr
out_ds = dplyr::filter(in_ds, gender == 'M', age >= 18)
/* One-way frequency table of myvar (counts only) */
proc freq data=mydata;
tables myvar / nocol nopercent nocum;
run;
# counts of each distinct value of myvar (NAs omitted by default)
table(mydata$myvar)
/* One-way frequency table of myvar, most frequent value first */
proc freq order=freq data=mydata;
tables myvar / nocol nopercent nocum;
run;
# counts sorted from most to least frequent (order=freq)
sort(table(mydata$myvar), decreasing=TRUE)
/* One-way frequency table of myvar, counting missing values too */
proc freq data=mydata;
tables myvar / nocol nopercent nocum missing;
run;
# frequency table that includes missing values (cf. the missing option);
# only displays NA count if positive, to always include use "always"
table(mydata$myvar, useNA="ifany")
/* Frequency table with counts and percentages, missing included */
proc freq data=mydata;
tables myvar / nocol nopercent nocum missing;
run;
# no concise way to combine counts w/ percentages in core R
# (fixed: `True` is not valid R — the boolean literal is TRUE)
counts = sort(table(mydata$myvar, useNA="ifany"), decreasing=TRUE)
counts
prop.table(counts)
# we can do better with dplyr
library(dplyr)
# pfreq(data, var, ...): one row per group with its count n and
# percentage of the total, sorted by descending count — mirrors
# proc freq with order=freq
pfreq = function(...) {
  group_by(...) %>%
    summarise(n=n()) %>%
    mutate(perc=paste0(round(100 * n/sum(n), 1), "%")) %>%
    arrange(desc(n))
}
pfreq(mydata, myvar)
/* Five-number summary plus mean of myvar */
proc means data=mydata min p25 median mean p75 max;
var myvar;
run;
# min, quartiles, median, mean, max — the same statistics requested above
summary(mydata$myvar)
/* Detailed distribution: n, mean, std, extremes and percentiles */
proc means data=mydata n mean std min max p1 p5 p10 p25 median p75 p90 p95 p99;
var myvar;
run;
# percentiles only — for the remaining requested statistics use
# length(), mean() and sd() (add na.rm=TRUE if myvar has NAs)
quantile(mydata$myvar, c(0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1))
/* Sort by myvar1 ascending, then myvar2 descending */
proc sort data=in_data out=out_data; by myvar1 descending myvar2; run;
# NOTE: the unary-minus trick for descending order only works for
# numeric columns; dplyr's desc() below works for any sortable type.
out_data = in_data[order(in_data$myvar1, -in_data$myvar2), ]
# or with dplyr
out_data = dplyr::arrange(in_data, myvar1, desc(myvar2))
/* Send fully-duplicated rows to dataset dups, discard the sorted output */
proc sort data=mydata NODUPKEY out=_null_ dupout=dups; by _all_; run;
# rows that are exact duplicates of an earlier row
dups = mydata[duplicated(mydata), ]
# if you want to count duplicated rows: sum(duplicated(mydata))
/* Send rows whose key myvar is duplicated to dataset dups */
proc sort data=mydata NODUPKEY out=_null_ dupout=dups; by myvar; run;
# values of myvar that already appeared earlier in the vector
dups = mydata$myvar[duplicated(mydata$myvar)]
/* Print the first 6 observations */
proc print data=mydata (obs=6);
run;
# head() shows 6 rows by default, matching obs=6; pass n= to change
head(mydata)
/* Display dataset metadata (variable names, types, attributes) */
proc contents data=mydata;
run;
# compactly display the structure of mydata (printed only, not returned)
str(mydata)
/* Save the metadata to a dataset instead of printing it */
proc contents noprint data=mydata out=contents;
run;
# str() prints to the console and returns NULL invisibly, so
# `contents = str(mydata)` always assigns NULL. To get an actual
# dataset of metadata (like proc contents out=), build one:
contents = data.frame(
  variable = names(mydata),
  class = vapply(mydata, function(col) class(col)[1], character(1)),
  row.names = NULL
)
# (alternatively, capture the printed output as text:
#  contents = capture.output(str(mydata)))
/* Define a numeric format that buckets age into labelled groups;
   15-<21 means 15 <= age < 21, other values map to blank */
proc format;
value agefmt
15-<21 = "15-20"
21-<25 = "21-24"
25-high = "25+"
other = " "
;
run;
/* Apply the format to derive a character age-group variable */
data out_ds;
set in_ds;
age_grouped=put(age, agefmt.);
run;
# Bin age into [15,21), [21,25), [25,Inf); right=FALSE closes the
# intervals on the left, matching SAS's 15-<21 ranges.
# NOTE: ages below 15 become NA here, whereas the SAS format's
# `other` clause maps them to " ".
out_ds$age_f = cut(out_ds$age,
breaks=c(15, 21, 25, Inf),
right=FALSE,
labels=c("15-20", "21-24", "25+"),
ordered_result=TRUE)
* haha nice try. Try this for size: http://www2.sas.com/proceedings/sugi26/p095-26.pdf;
# number of observations (rows)
nrow(mydata) # or NROW(mydata)
dim(mydata) # lists the number of rows and columns in a dataframe
cap = function(x, xmin=-Inf, xmax=Inf) {
  # Clamp the values of x into the closed interval [xmin, xmax].
  # x = value to be capped (numeric, vectorised)
  # xmin = lower bound to be imposed on x (numeric)
  # xmax = upper bound to be imposed on x (numeric)
  capped_from_above = pmin(x, xmax)
  pmax(xmin, capped_from_above)
}
# clamp number into [10, 200000] using the cap() helper defined above
mydata$number_capped = cap(mydata$number, 10, 200000)
# Fit a generalized additive model; with a bare linear term like this
# it reduces to ordinary linear regression — wrap the term as
# s(predictor) to fit a smooth instead
fit = mgcv::gam(response ~ predictor, data=in_ds)