# Preambule

In [1]:
library(tidyverse)
library(stringr)
library(caret)
library(data.table)
library(stringr)
library(dplyr)
library(randomForest)
library(qs)
library(parallel)
library(Boruta)
library(clustermq)

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

 

In [2]:
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/envs/r-ml/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] clustermq_0.8.95.1  Boruta_7.0.0        qs_0.23.4          
 [4] randomForest_4.6-14 data.table_1.13.4   caret_6.0-86       
 [7] lattice_0.20-41     forcats_0.5.0       stringr_1.4.0      
[10] dplyr_1.0.2         purrr_0.3.4         readr_1.4.0      

# Data

In [3]:
taxa <- qread('../data/taxa_table.qs')
pp <- qread('../data/tax_meta.qs')

In [5]:
# remove the MetaCyc pathways
to_rm <- seq(pp %>% colnames %>% str_which(pattern = '^[:lower:]\\_{1}') %>% max, ncol(pp), 1)
pp <- pp[,-to_rm]
pp %>% dim

In [6]:
# remove the metadata
meta <- c('dataset_name','Sample','age','gender','country','BMI','westernized')
X <- select(pp, -all_of(meta))

In [7]:
# keep only taxa with prevalance > 24%
tmp <- colnames(X)[which(colSums(X != 0) > nrow(X)/4)]
X <- select(X, all_of(tmp))
X %>% dim

In [8]:
# shuffle samples
set.seed(0)
X <- X[complete.cases(X),][sample(1:nrow(X)),]

# Make the target

In [9]:
# draw random variables for predicting groups
set.seed(1209)
var_ix <- sample(which(colSums(X != 0) > nrow(X)/2), 9, replace = FALSE)
var_n <- colnames(X)[var_ix]

In [10]:
var_n

In [11]:
nr <- nrow(X)
ng <- floor(nr/4)
# make groups
X <- as.data.table(X)[,'group':=c(rep('a',ng), rep('b', ng), rep('c', ng), rep('d', nr-3*ng))]

In [12]:
target <- data.frame('group'= X$group, 'tc'='1', stringsAsFactors = FALSE)

In [13]:
# for each group, make target according to random drawn taxa of var_n
target$tc[target$group == 'a'] <- ifelse( X$s_Marvinbryantia_sp900066075[target$group == 'a'] > 0 
                                         & X$g_Alistipes_A[target$group == 'a'] > 0 
                                         , '1', '-1')
target$tc[target$group == 'b'] <- ifelse( X$f_Bacteroidaceae[target$group == 'b'] > 10^-(1) 
                                         & X$g_Dialister[target$group == 'b'] > 10^-(2.5)
                                         , '1', '-1')
target$tc[target$group == 'c'] <- ifelse((X$s_Oscillibacter_sp001916835[target$group == 'c'] > 0 
                                         & X$s_Bacteroides_clarus[target$group == 'c'] > 0)
                                         | X$s_Faecalibacterium_prausnitzii_G[target$group == 'c'] >10^-2
                                         , '1', '-1')
target$tc[target$group == 'd'] <- ifelse( X$s_Lawsonibacter_sp000177015[target$group == 'd'] <= 10^-3.4 
                                         & X$f_Anaerovoracaceae[target$group == 'd'] > 0
                                         , '1', '-1')

In [14]:
table(target$tc, target$group)

    
       a   b   c   d
  -1 319 299 214 361
  1  217 237 322 178

In [15]:
# randomise group labels
groups <- c('a', 'b', 'c', 'd', 'e')
set.seed(0)
brnounou <- rbinom(n = length(X$group), size = 1,prob = 0.05)
for (i in 1:length(brnounou)){
    if (brnounou[i] == 1){
        set.seed(i)
        X$group[i] <- sample(groups[groups != X$group[i]], 1)
    }
}

In [16]:
table(target$tc, X$group)

    
       a   b   c   d   e
  -1 312 296 226 347  12
  1  219 232 318 175  10

# Train 

## data

In [18]:
target_c <- as.factor(target$tc)
X <- X[, 'group':= as.factor(group)]

In [19]:
# transform to dummy
dummies <- dummyVars(~ ., data = X )
dummies <- as.data.table(predict(dummies, newdata = X ))

In [20]:
colnames(dummies) <- colnames(dummies) %>% str_replace_all(pattern = '\\.', replacement ='')

## CV

In [21]:
options(clustermq.scheduler = "sge", clustermq.template = "~/.clustermq.tmpl")
tmpl <- list(conda = "r-ml", cores = 5, job_time = '00:59:00', job_mem = '5G')

In [22]:
wf <- function(ix, data, target, ntree = 100){
    
    set.seed(ix[1])
    res <- list()
    # feature selection
    message('Boruta')
    bor <-  Boruta(x = data[ix,], y = target[ix])
    bor <- TentativeRoughFix(x = bor)
    res$confirmed <- names(bor$finalDecision[bor$finalDecision == 'Confirmed']) 
    
    # select data
    message('Subset data')
    to_keep <- res$confirmed %>% unique
    X_fs <- select(data, all_of(to_keep))
    
    # RF
    message('RF')
    rf_fs <- randomForest(x = X_fs[ix,], y = target[ix], ntree = ntree)
    pred <- predict(object = rf_fs, newdata = X_fs[-ix, ])
    tmp <- confusionMatrix(data = pred, reference = target[-ix])
    res$rf_performance <- tmp$overall
    
    return(res)
}

In [23]:
set.seed(0)
trainIx <- createDataPartition(y = target_c, times = 10, p = .7, list = TRUE)

In [29]:
res <- Q(wf
  , ix = trainIx
  , const = list('data'= dummies, 'target' = target_c, 'ntree' = 500)
  , n_jobs= 9
  , pkgs=c('caret', 'randomForest', 'dplyr', 'Boruta')
  , log_worker=FALSE
  , template = tmpl
 )

Submitting 9 worker jobs (ID: cmq9742) ...

Running 10 calculations (3 objs/8.7 Mb common; 1 calls/chunk) ...


[----------------------------------------------------]   0% (1/9 wrk) eta:  ?s

[----------------------------------------------------]   0% (2/9 wrk) eta:  ?s

[----------------------------------------------------]   0% (3/9 wrk) eta:  ?s

[----------------------------------------------------]   0% (4/9 wrk) eta:  ?s

[----------------------------------------------------]   0% (5/9 wrk) eta:  ?s

[----------------------------------------------------]   0% (6/9 wrk) eta:  ?s

[----------------------------------------------------]   0% (7/9 wrk) eta:  ?s

[----------------------------------------------------]   0% (8/9 wrk) eta:  ?s

[----------------------------------------------------]   0% (9/9 wrk) eta:  ?s

[====>-----------------------------------------------]  10% (9/9 wrk) eta:  1h









                                                                              



In [31]:
rf_sum <- t(sapply(res, function(x){x$rf_performance}))

In [32]:
rf_sum %>% summary
round(sd(rf_sum[,1]*100), digits=2)
round(sd(rf_sum[,2]), digits=2)

    Accuracy          Kappa        AccuracyLower    AccuracyUpper   
 Min.   :0.7589   Min.   :0.5052   Min.   :0.7240   Min.   :0.7915  
 1st Qu.:0.7652   1st Qu.:0.5194   1st Qu.:0.7304   1st Qu.:0.7974  
 Median :0.7714   Median :0.5306   Median :0.7369   Median :0.8033  
 Mean   :0.7762   Mean   :0.5406   Mean   :0.7420   Mean   :0.8078  
 3rd Qu.:0.7858   3rd Qu.:0.5593   3rd Qu.:0.7520   3rd Qu.:0.8169  
 Max.   :0.8009   Max.   :0.5941   Max.   :0.7679   Max.   :0.8311  
  AccuracyNull    AccuracyPValue      McnemarPValue      
 Min.   :0.5552   Min.   :0.000e+00   Min.   :9.040e-06  
 1st Qu.:0.5552   1st Qu.:1.000e-33   1st Qu.:1.354e-04  
 Median :0.5552   Median :4.096e-30   Median :1.323e-03  
 Mean   :0.5552   Mean   :7.176e-28   Mean   :1.676e-02  
 3rd Qu.:0.5552   3rd Qu.:2.648e-28   3rd Qu.:4.969e-03  
 Max.   :0.5552   Max.   :5.903e-27   Max.   :9.308e-02  

## final

In [33]:
set.seed(0)
bor <-  Boruta(x = dummies, y = as.factor(target_c))
bor <- TentativeRoughFix(x = bor)

In [34]:
# select data
to_keep <- names(bor$finalDecision[bor$finalDecision == 'Confirmed']) 
X_fs <- select(dummies, all_of(to_keep))
length(to_keep)

In [35]:
var_n %in% to_keep
var_n

In [36]:
# RF
set.seed(0)
rf_fs <- randomForest(x = X_fs, y = as.factor(target_c), ntree = 500)

In [37]:
rf_fs
cat('RF accuracy = ', 100*round((rf_fs$confusion[1,1] + rf_fs$confusion[2,2])/sum(rf_fs$confusion), digits = 4))


Call:
 randomForest(x = X_fs, y = as.factor(target_c), ntree = 500) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 9

        OOB estimate of  error rate: 20.91%
Confusion matrix:
     -1   1 class.error
-1 1032 161   0.1349539
1   288 666   0.3018868

RF accuracy =  79.07