Reading instructions:
- under each subheading, the first code block is the original SAS and the second code block is its R translation;
- Ctrl+F is your friend for jumping straight to the procedure you need.
/* Copy the dataset in_lib.in_ds to a new dataset save_lib.save_ds */
data save_lib.save_ds;
set in_lib.in_ds;
run;
# Here save_lib is a string representing a directory. '.rda' is
# sometimes used instead of '.RData'
save(in_ds, file=file.path(save_lib, "save_ds.RData"))
# To load in_ds back into the environment
load(file.path(save_lib, "save_ds.RData"))
# NOTE(review): load() restores the object under its original name
# (in_ds), not save_ds, so unlike the SAS step no rename happens.
# For a single object, saveRDS()/readRDS() lets you choose the name
# at read time:
#   saveRDS(in_ds, file.path(save_lib, "save_ds.rds"))
#   save_ds = readRDS(file.path(save_lib, "save_ds.rds"))
/* Stack (concatenate) the rows of mydata1 and mydata2 */
data concatenated;
set mydata1 mydata2;
run;
# Note that if a factor variable in the two dataframes has different
# levels, then rbind will take the union of these levels
# while bind_rows will coerce the variable to string.
# NOTE(review): bind_rows' factor handling has changed across dplyr
# versions (recent versions also take the union of levels) — verify
# against the dplyr version in use.
concatenated = rbind(mydata1, mydata2)
concatenated = dplyr::bind_rows(mydata1, mydata2)
/* Keep only rows for adult males when copying in_ds to out_ds */
data out_ds;
set in_ds;
where gender = 'M' and age >= 18;
run;
# NOTE: with base `[` subsetting, rows where gender or age is NA come
# through as all-NA rows (a logical index of NA selects an NA row);
# dplyr::filter drops those rows, which matches the SAS where clause
# more closely.
out_ds = in_ds[in_ds$gender == 'M' & in_ds$age >= 18, ]
# or with dplyr
out_ds = dplyr::filter(in_ds, gender == 'M', age >= 18)
/* One-way frequency table of myvar (counts only) */
proc freq data=mydata;
tables myvar / nocol nopercent nocum;
run;
# counts of each distinct value of myvar (NAs omitted by default)
table(mydata$myvar)
/* One-way frequency table of myvar, most frequent value first */
proc freq order=freq data=mydata;
tables myvar / nocol nopercent nocum;
run;
# counts sorted from most to least frequent (order=freq)
sort(table(mydata$myvar), decreasing=TRUE)
/* One-way frequency table of myvar, counting missing values too */
proc freq data=mydata;
tables myvar / nocol nopercent nocum missing;
run;
# frequency table that includes missing values (cf. the missing option);
# only displays NA count if positive, to always include use "always"
table(mydata$myvar, useNA="ifany")
/* Frequency table with counts and percentages, missing included */
proc freq data=mydata;
tables myvar / nocol nopercent nocum missing;
run;
# no concise way to combine counts w/ percentages in core R
# (fixed: `True` is not valid R — the boolean literal is TRUE)
counts = sort(table(mydata$myvar, useNA="ifany"), decreasing=TRUE)
counts
prop.table(counts)
# we can do better with dplyr
library(dplyr)
# pfreq(data, var, ...): one row per group with its count n and
# percentage of the total, sorted by descending count — mirrors
# proc freq with order=freq
pfreq = function(...) {
  group_by(...) %>%
    summarise(n=n()) %>%
    mutate(perc=paste0(round(100 * n/sum(n), 1), "%")) %>%
    arrange(desc(n))
}
pfreq(mydata, myvar)
/* Five-number summary plus mean of myvar */
proc means data=mydata min p25 median mean p75 max;
var myvar;
run;
# min, quartiles, median, mean, max — the same statistics requested above
summary(mydata$myvar)
/* Detailed distribution: n, mean, std, extremes and percentiles */
proc means data=mydata n mean std min max p1 p5 p10 p25 median p75 p90 p95 p99;
var myvar;
run;
# percentiles only — for the remaining requested statistics use
# length(), mean() and sd() (add na.rm=TRUE if myvar has NAs)
quantile(mydata$myvar, c(0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1))
/* Sort by myvar1 ascending, then myvar2 descending */
proc sort data=in_data out=out_data; by myvar1 descending myvar2; run;
# NOTE: the unary-minus trick for descending order only works for
# numeric columns; dplyr's desc() below works for any sortable type.
out_data = in_data[order(in_data$myvar1, -in_data$myvar2), ]
# or with dplyr
out_data = dplyr::arrange(in_data, myvar1, desc(myvar2))
/* Send fully-duplicated rows to dataset dups, discard the sorted output */
proc sort data=mydata NODUPKEY out=_null_ dupout=dups; by _all_; run;
# rows that are exact duplicates of an earlier row
dups = mydata[duplicated(mydata), ]
# if you want to count duplicated rows: sum(duplicated(mydata))
/* Send rows whose key myvar is duplicated to dataset dups */
proc sort data=mydata NODUPKEY out=_null_ dupout=dups; by myvar; run;
# values of myvar that already appeared earlier in the vector
dups = mydata$myvar[duplicated(mydata$myvar)]
/* Print the first 6 observations */
proc print data=mydata (obs=6);
run;
# head() shows 6 rows by default, matching obs=6; pass n= to change
head(mydata)
/* Display dataset metadata (variable names, types, attributes) */
proc contents data=mydata;
run;
# compactly display the structure of mydata (printed only, not returned)
str(mydata)
/* Save the metadata to a dataset instead of printing it */
proc contents noprint data=mydata out=contents;
run;
# str() prints to the console and returns NULL invisibly, so
# `contents = str(mydata)` always assigns NULL. To get an actual
# dataset of metadata (like proc contents out=), build one:
contents = data.frame(
  variable = names(mydata),
  class = vapply(mydata, function(col) class(col)[1], character(1)),
  row.names = NULL
)
# (alternatively, capture the printed output as text:
#  contents = capture.output(str(mydata)))
/* Define a numeric format that buckets age into labelled groups;
   15-<21 means 15 <= age < 21, other values map to blank */
proc format;
value agefmt
15-<21 = "15-20"
21-<25 = "21-24"
25-high = "25+"
other = " "
;
run;
/* Apply the format to derive a character age-group variable */
data out_ds;
set in_ds;
age_grouped=put(age, agefmt.);
run;
# Bin age into [15,21), [21,25), [25,Inf); right=FALSE closes the
# intervals on the left, matching SAS's 15-<21 ranges.
# NOTE: ages below 15 become NA here, whereas the SAS format's
# `other` clause maps them to " ".
out_ds$age_f = cut(out_ds$age,
breaks=c(15, 21, 25, Inf),
right=FALSE,
labels=c("15-20", "21-24", "25+"),
ordered_result=TRUE)
* haha nice try. Try this for size: http://www2.sas.com/proceedings/sugi26/p095-26.pdf;
# number of observations (rows)
nrow(mydata) # or NROW(mydata)
dim(mydata) # lists the number of rows and columns in a dataframe
cap = function(x, xmin=-Inf, xmax=Inf) {
  # Clamp the values of x into the closed interval [xmin, xmax].
  # x = value to be capped (numeric, vectorised)
  # xmin = lower bound to be imposed on x (numeric)
  # xmax = upper bound to be imposed on x (numeric)
  capped_from_above = pmin(x, xmax)
  pmax(xmin, capped_from_above)
}
# clamp number into [10, 200000] using the cap() helper defined above
mydata$number_capped = cap(mydata$number, 10, 200000)
# Fit a generalized additive model; with a bare linear term like this
# it reduces to ordinary linear regression — wrap the term as
# s(predictor) to fit a smooth instead
fit = mgcv::gam(response ~ predictor, data=in_ds)