In [1]:
source("../Data Generator.r")
library(randomForest)

randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.


In [14]:
# Simulation settings shared by the training and test draws.
var_re <- 3
n <- 400
p <- 400
imp_mod <- c(1, 4)
var_noise <- 1

# Training set, drawn under a fixed seed so the data are reproducible.
set.seed(100)
data <- sim_time(
  n = n,
  p = p,
  imp_mod = imp_mod,
  var_noise = var_noise,
  a1 = 0,
  a2 = 0,
  var_re = var_re
)

# Test set, drawn under its own seed; it is only used to score the model
# fitted with the chosen parameters.
set.seed(101)
n_test <- 100
data_test <- sim_time(
  n = n_test,
  p = p,
  imp_mod = imp_mod,
  var_noise = var_noise,
  a1 = 0,
  a2 = 0,
  var_re = var_re
)

In [15]:
# Run-level settings.
n_run <- 50 # how many times the random forest is fitted to the data set
n_top <- 10 # how many of the highest-ranked variables count as selected

# Zero-filled results table: one row per run plus a final row reserved for
# the average; columns V1..Vp hold the per-variable entries and the last
# column, "error", holds the error of the run.
result_rf <- as.data.frame(matrix(0, nrow = n_run + 1, ncol = p + 1))
names(result_rf) <- c(paste0("V", 1:p), "error")

In [16]:
# Fit a random forest n_run times on the same training data (a different seed
# per run), record for each run which variables rank in the top n_top by
# importance and the mean squared error on the test set, then append the
# column means as the last row and write the table to "rf_n<n>.csv".
mtry <- 200 # change mtry for each n

# The predictor names and the model formula do not depend on the run, so they
# are built once here rather than on every iteration.
var <- paste0("V", seq_len(p))
Formula <- as.formula(paste("y~", paste(var, collapse = "+")))

system.time({
  for (Repeat in seq_len(n_run)) {
    set.seed(Repeat + 32) # change seed each loop

    rf <- randomForest(formula = Formula, data = data, mtry = mtry)

    # Mean squared prediction error on the test set.
    preds <- predict(rf, newdata = data_test)
    error <- mean((data_test$y - preds)^2)

    # Rank the variables by importance (a full ranking, not just a selection).
    # `ix` holds positions into rf$importance, whose rows follow the order of
    # the formula terms V1..Vp, so position i refers to column Vi.
    importance_order <- sort(
      rf$importance,
      decreasing = TRUE,
      index.return = TRUE
    )
    top_variables <- importance_order$ix[seq_len(n_top)]

    # 1 if the variable is among the top n_top, 0 otherwise. The whole row is
    # filled in one assignment instead of p single-cell data frame updates.
    result_rf[Repeat, seq_len(p)] <- as.numeric(seq_len(p) %in% top_variables)
    result_rf[Repeat, p + 1] <- error

    # Print the progress line first, then flush so it shows up immediately.
    cat(Repeat, "\n")
    flush.console()
  }
})

# Last row: selection frequency of each variable and the mean test error.
result_rf[n_run + 1, ] <- colMeans(result_rf[seq_len(n_run), ])
name <- paste0("rf_n", n, ".csv")
write.csv(result_rf, file = name)

1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 
14 
15 
16 
17 
18 
19 
20 
21 
22 
23 
24 
25 
26 
27 
28 
29 
30 
31 
32 
33 
34 
35 
36 
37 
38 
39 
40 
41 
42 
43 
44 
45 
46 
47 
48 
49 
50 


   user  system elapsed 
9565.61   40.22 9690.75 

In [18]:
# Display the results table: rows 1..n_run hold the per-run 0/1 top-variable
# indicators and test error; the final row holds the column means.
result_rf

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V392,V393,V394,V395,V396,V397,V398,V399,V400,error
1,1,1,0.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,45.79886
1,1,1,0.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,44.7455
1,1,1,0.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,44.79948
1,1,1,0.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,44.93023
1,1,1,0.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,44.9851
1,1,1,0.0,0,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,45.53355
1,1,1,0.0,0,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,44.67138
1,1,1,0.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,44.97522
1,1,1,0.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,45.1115
1,1,1,0.0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,0,0,45.15025
