# Random Forest benchmarks in R

In [1]:
source('utils/iRF_benchmarks_Rlib.R')

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.


## Load data

In [2]:
# Load the WDBC breast cancer data: after dropping the identifier column,
# the first column is the response and the remaining columns are features.

raw_data <- read.delim("http://ftp.cs.wisc.edu/math-prog/cpo-dataset/machine-learn/cancer/WDBC/WDBC.dat", 
                       sep = ",", header = FALSE)
raw_data <- raw_data[, -1] # remove first column, it's just an identifier

# Name the columns: the response is 'y', the features are x_0, x_1, ...
# (zero-based). The feature count is derived from the data rather than
# hard-coded, so the naming cannot fall out of step with the file.
names(raw_data) <- c("y", paste0("x_", seq_len(ncol(raw_data) - 1) - 1))

head(raw_data)

features <- raw_data[, -1]
# Store the response as a factor, as the splicing cell does. On R >= 4.0
# read.delim() no longer converts strings to factors, so without this the
# response would be a plain character vector.
# NOTE(review): presumably RF_benchmarks expects a factor response for
# classification -- confirm against utils/iRF_benchmarks_Rlib.R.
responses <- as.factor(raw_data[, 1])

y,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,⋯,x_20,x_21,x_22,x_23,x_24,x_25,x_26,x_27,x_28,x_29
M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,⋯,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,⋯,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,⋯,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,⋯,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,⋯,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,⋯,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [2]:
# Load the splicing data and turn it into a binary classification problem.

splice.data <- read.table('Y_X_splicing.txt')

# Binarise psi: rows with psi > 0.7 become class 1, rows with psi < 0.3 become
# class 0, and everything in between is dropped.
idcs.high <- splice.data$psi > 0.7
idcs.low <- splice.data$psi < 0.3
splice.data$y <- rep(0, nrow(splice.data))
# which() ignores NA comparisons, so a missing psi drops the row instead of
# producing an all-NA row in the subset below.
splice.data$y[which(idcs.high)] <- 1
splice.data <- splice.data[which(idcs.high | idcs.low), ]

# Random 50/50 split of the row indices.
n.full <- nrow(splice.data)
idcs.train <- sample.int(n.full, floor(n.full * 0.5))
# setdiff() instead of (1:n.full)[-idcs.train]: negative indexing with an
# empty index vector selects nothing, which would leave the test set empty
# whenever the training set is empty.
idcs.test <- setdiff(seq_len(n.full), idcs.train)

# Features: every column except psi and y, restricted to the first 270.
x <- splice.data[, !colnames(splice.data) %in% c('psi', 'y')]
x <- x[, 1:270]
features <- as.matrix(x)
responses <- as.factor(splice.data$y)

## Call function to run benchmarks

In [3]:
# Benchmark settings, passed straight through to RF_benchmarks():
#   train_split_propn - proportion of samples used for training
#   n_estimators      - number of trees in the random forest
#   n_trials          - number of repeated trials
train_split_propn <- 0.8
n_estimators <- 20
n_trials <- 10

# R continues an expression across lines while a parenthesis is still open;
# a trailing backslash is not valid R syntax, so none is used here.
benchmarks <- RF_benchmarks(features, responses,
                            train_split_propn = train_split_propn,
                            n_estimators = n_estimators,
                            n_trials = n_trials)

## Let's look at the results

### Relevant parameters

In [4]:
# Report the dataset size, the train/test sample counts implied by
# train_split_propn, and the forest size.
cat("Dimensions of full dataset (#samples , # features): ",
    dim(features),
    "\n")
cat("Number of training samples: ",
    round(nrow(features) * train_split_propn),
    "\n")
cat("Number of test samples: ",
    round(nrow(features) * (1 - train_split_propn)),
    "\n")
cat("number of trees in the random forest: ", n_estimators)

Dimensions of full dataset (#samples , # features):  23823 270 
Number of training samples:  19058 
Number of test samples:  4765 
number of trees in the random forest:  20

### Timing and some accuracy scores across trials

In [5]:
# Print a caption, then display the metrics_summary element of the
# benchmark results.
print("mean and std of various metrics across trials")
benchmarks$metrics_summary

[1] "mean and std of various metrics across trials"


### Stability of feature importances across trials

In [6]:
print('top five feature importances across trials')

# For each trial, print the (zero-based) indices of the five features with
# the largest importance, so stability across trials can be compared by eye.
for (i in seq_len(n_trials)) {
    # rank the features from most to least important for this trial
    x <- benchmarks$feature_importance[[i]]
    ord <- order(x, decreasing = TRUE)
    # head() rather than ord[0:5]: index 0 is silently dropped in R, and
    # indexing past the end would pad the result with NA.
    print(head(ord, 5) - 1) # -1 to compare with python output
}


[1] "top five feature importances across trials"
[1] 255 267 266 259 261
[1] 255 259 267 266 263
[1] 266 255 267 259 263
[1] 267 255 259 263 266
[1] 255 266 267 259 263
[1] 267 255 259 266 261
[1] 267 255 266 259 261
[1] 255 267 266 259 263
[1] 267 255 266 263 259
[1] 266 255 267 259 263
