# Libraries

In [1]:
library(tidyverse)
library(caret)
library(data.table)
library(stringr)
library(dplyr)
library(qs)

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift



Attaching package: ‘data.table’


T

In [2]:
library(clustermq)
# Point clustermq at the SGE scheduler and the user's job template
options(
    clustermq.scheduler = "sge",
    clustermq.template = "~/.clustermq.tmpl"
)

In [3]:
# Record the R version, platform, locale and attached package versions
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/envs/r-ml/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] clustermq_0.8.95.1 qs_0.23.4          data.table_1.13.4  caret_6.0-86      
 [5] lattice_0.20-41    forcats_0.5.0      stringr_1.4.0      dplyr_1.0.2       
 [9] purrr_0.3.4        readr_1.4.0        tidyr_1.1.2        tibble_3.0.4      
[13] ggplot2_3.3.2      tidy

# Function

In [4]:
#' Boruta feature selection followed by a random-forest evaluation on one split
#'
#' Runs Boruta on the training rows, keeps the confirmed features (after
#' `TentativeRoughFix()`), fits a random forest on the `meta` columns plus the
#' confirmed features, and scores it on the rows not in `ix`.
#'
#' @param ix Row indices of the training samples. Every other row is used as
#'   the hold-out set. `ix[1]` is used as the random seed for both stochastic
#'   stages, so the result is reproducible for a given split.
#' @param data Table of predictors with one row per sample (assumed to be a
#'   data frame, since it is passed to `select()`).
#' @param meta Character vector of column names that are always kept in the
#'   random-forest model, whether or not Boruta confirms them.
#' @param target Outcome vector aligned with the rows of `data`
#'   (presumably a factor, as it is passed to `confusionMatrix()`).
#' @param ntree Number of trees passed to `randomForest()`.
#' @return A list with two elements: `confirmed`, the names of the features
#'   Boruta confirmed, and `rf_performance`, the `overall` element of the
#'   `confusionMatrix()` computed on the hold-out rows.
wf <- function(ix, data, meta, target, ntree){

    res <- list()

    # feature selection
    message('Boruta')
    # Seed before Boruta as well: it is a stochastic procedure, and without a
    # seed here the selected features depend on the caller's RNG state.
    set.seed(ix[1])
    bor <- Boruta(x = data[ix,], y = target[ix])
    message('Rough boruta')
    # Resolve features left 'Tentative' so every feature has a final decision
    bor <- TentativeRoughFix(x = bor)
    is_confirmed <- bor$finalDecision == 'Confirmed'
    res$confirmed <- names(bor$finalDecision[is_confirmed])

    # select data
    message('Subset data')
    keep <- unique(c(meta, res$confirmed))
    X_fs <- select(data, all_of(keep))
    # Convert on the full table (train + hold-out) so that factor levels are
    # identical in both subsets when predicting.
    X_fs <- mutate(X_fs, across(where(is.character), as.factor))

    # RF
    message('RF')
    # Re-seed so the forest is seeded exactly as before, independent of how
    # many random draws the feature-selection stage consumed.
    set.seed(ix[1])
    rf_fs <- randomForest(x = X_fs[ix,], y = target[ix], ntree = ntree)
    pred <- predict(object = rf_fs, newdata = X_fs[-ix,])
    cm <- confusionMatrix(data = pred, reference = target[-ix])
    res$rf_performance <- cm$overall

    res
}

# Data

In [5]:
# Load the predictor table (X) and the outcome (target) from their RDS files
X      <- readRDS(file = '../data/X.RDS')
target <- readRDS(file = '../data/target.RDS')

In [6]:
# Peek at the first few column names of X
head(colnames(X))

In [7]:
# Columns passed to wf() as `meta`: always kept in the random-forest model
mnames <- c(
    'Age',
    'BMI',
    'Sex'
)

In [8]:
# Draw 10 train/test partitions of the outcome with 70% of the samples in
# each training set; the fixed seed makes the partitions repeatable.
set.seed(0)
trainIx <- createDataPartition(
    y = target,
    p = 0.7,
    times = 10,
    list = TRUE
)

# Train 

## CV

In [9]:
# Values filled into the clustermq job template for every worker
tmpl <- list(
    conda    = "r-ml",
    cores    = 3,
    job_time = '00:59:00',
    job_mem  = '5G'
)

In [13]:
# Run wf() once per training partition on the cluster using 10 worker jobs.
# The data, metadata column names, outcome and tree count are constant
# across calls; only the training indices (ix) vary.
res <- Q(
    wf,
    ix = trainIx,
    const = list('data' = X, 'meta' = mnames, 'target' = target, 'ntree' = 500),
    n_jobs = 10,
    pkgs = c('caret', 'randomForest', 'dplyr', 'Boruta'),
    log_worker = FALSE,
    template = tmpl
)

Submitting 10 worker jobs (ID: cmq6890) ...

Running 10 calculations (4 objs/1 Mb common; 1 calls/chunk) ...


[---------------------------------------------------]   0% (1/10 wrk) eta:  ?s

[---------------------------------------------------]   0% (2/10 wrk) eta:  ?s

[---------------------------------------------------]   0% (3/10 wrk) eta:  ?s

[---------------------------------------------------]   0% (4/10 wrk) eta:  ?s

[---------------------------------------------------]   0% (5/10 wrk) eta:  ?s

[---------------------------------------------------]   0% (6/10 wrk) eta:  ?s

[---------------------------------------------------]   0% (7/10 wrk) eta:  ?s

[---------------------------------------------------]   0% (8/10 wrk) eta:  ?s

[---------------------------------------------------]   0% (9/10 wrk) eta:  ?s

[--------------------------------------------------]   0% (10/10 wrk) eta:  ?s

[====>---------------------------------------------]  10% (10/10 wrk) eta:  7m









 

In [14]:
# Persist the per-partition results so the cluster run need not be repeated
saveRDS(object = res, file = '../tmp/Boruta.RDS')

In [55]:
#res <- readRDS('../tmp/Boruta.RDS')

In [15]:
# Collect the performance vector of every partition into a matrix with one
# row per partition and one column per metric
rf_sum <- t(sapply(res, "[[", "rf_performance"))

In [16]:
# Distribution of each performance metric across the partitions
summary(rf_sum)
# Standard deviation of the first two metric columns (Accuracy, Kappa)
round(sd(rf_sum[, 1]), digits = 4)
round(sd(rf_sum[, 2]), digits = 4)

    Accuracy          Kappa        AccuracyLower    AccuracyUpper   
 Min.   :0.7632   Min.   :0.5289   Min.   :0.5976   Min.   :0.8856  
 1st Qu.:0.8224   1st Qu.:0.6484   1st Qu.:0.6644   1st Qu.:0.9269  
 Median :0.8684   Median :0.7354   Median :0.7191   Median :0.9559  
 Mean   :0.8553   Mean   :0.7109   Mean   :0.7046   Mean   :0.9463  
 3rd Qu.:0.8882   3rd Qu.:0.7754   3rd Qu.:0.7437   3rd Qu.:0.9669  
 Max.   :0.9211   Max.   :0.8421   Max.   :0.7862   Max.   :0.9834  
  AccuracyNull    AccuracyPValue      McnemarPValue   
 Min.   :0.5263   Min.   :1.726e-07   Min.   :0.1306  
 1st Qu.:0.5263   1st Qu.:3.303e-06   1st Qu.:0.3711  
 Median :0.5263   Median :8.982e-06   Median :0.5610  
 Mean   :0.5263   Mean   :2.830e-04   Mean   :0.6215  
 3rd Qu.:0.5263   3rd Qu.:1.621e-04   3rd Qu.:1.0000  
 Max.   :0.5263   Max.   :2.352e-03   Max.   :1.0000  