# Preamble

In [1]:
library(tidyverse)
library(stringr)
library(caret)
library(data.table)
library(stringr)
library(dplyr)
library(randomForest)
library(qs)
library(parallel)

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

 

In [2]:
# Print the R version, platform, locale and attached packages of this session.
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/envs/r-ml/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] qs_0.23.4           randomForest_4.6-14 data.table_1.13.4  
 [4] caret_6.0-86        lattice_0.20-41     forcats_0.5.0      
 [7] stringr_1.4.0       dplyr_1.0.2         purrr_0.3.4        
[10] readr_1.4.0         tidyr_1.1.2         tibble_3.0.4     

# Data

In [3]:
# Load the serialized input tables with qs::qread().
# NOTE(review): `taxa` is not referenced again in the visible part of this
# notebook; confirm it is still needed.
pp <- qread("../data/tax_meta.qs")
taxa <- qread("../data/taxa_table.qs")

In [4]:
# Drop the trailing block of columns (the MetaCyc pathways).
# Columns whose name starts with a lower-case letter followed by "_" are
# located, and everything from the last such column to the end of the table
# is removed.
# NOTE(review): the removed range starts AT the last matching column, so that
# column is dropped too; confirm this is intended and not an off-by-one.
to_rm <- seq(
  max(str_which(colnames(pp), pattern = '^[:lower:]\\_{1}')),
  ncol(pp),
  1
)
pp <- pp[, -to_rm]
dim(pp)

In [5]:
# Split off the sample metadata; X keeps every other column of pp.
meta <- c(
  'dataset_name', 'Sample', 'age', 'gender', 'country', 'BMI', 'westernized'
)
X <- pp %>% select(-all_of(meta))

In [6]:
# Keep only the columns that are non-zero in more than 25% of the samples
# (the cut-off is nrow(X) / 4).
tmp <- names(which(colSums(X != 0) > nrow(X) / 4))
X <- X %>% select(all_of(tmp))
dim(X)

In [7]:
# Drop incomplete rows, then shuffle the remaining samples.
# The permutation is drawn from the row count AFTER filtering: drawing it from
# the unfiltered row count would produce out-of-range indices (all-NA rows)
# whenever complete.cases() removes at least one row.
set.seed(0)
X <- X[complete.cases(X), , drop = FALSE]
X <- X[sample(seq_len(nrow(X))), , drop = FALSE]

# Make the target

In [8]:
# Reproducibly draw nine columns, without replacement, among those that are
# non-zero in more than half of the samples.
set.seed(1209)
var_ix <- sample(
  which(colSums(X != 0) > nrow(X) / 2),
  size = 9,
  replace = FALSE
)
var_n <- names(X)[var_ix]

In [9]:
# Display the names of the drawn columns.
var_n

In [10]:
# Assign consecutive rows to four groups: 'a', 'b' and 'c' receive
# floor(nr / 4) rows each, 'd' receives the remaining rows.
nr <- nrow(X)
ng <- floor(nr / 4)
X <- as.data.table(X)[
  ,
  group := rep(c('a', 'b', 'c', 'd'), times = c(ng, ng, ng, nr - 3 * ng))
]

In [11]:
# One row per sample: its group and a placeholder class label '1'.
target <- data.frame(group = X$group, tc = '1', stringsAsFactors = FALSE)

In [12]:
# Each group gets its own labelling rule, built from columns of X.
# The rule is evaluated on that group's rows only (data.table `i`), with the
# column names resolved inside X (data.table `j`).

# group a: both taxa present
target$tc[target$group == 'a'] <- X[
  target$group == 'a',
  ifelse(s_Marvinbryantia_sp900066075 > 0 & g_Alistipes_A > 0, '1', '-1')
]

# group b: both taxa above their respective thresholds
target$tc[target$group == 'b'] <- X[
  target$group == 'b',
  ifelse(f_Bacteroidaceae > 10^-(1) & g_Dialister > 10^-(2.5), '1', '-1')
]

# group c: (both taxa present) OR the third taxon above its threshold
target$tc[target$group == 'c'] <- X[
  target$group == 'c',
  ifelse(
    (s_Oscillibacter_sp001916835 > 0 & s_Bacteroides_clarus > 0) |
      s_Faecalibacterium_prausnitzii_G > 10^-2,
    '1', '-1'
  )
]

# group d: first taxon at or below its threshold AND second taxon present
target$tc[target$group == 'd'] <- X[
  target$group == 'd',
  ifelse(
    s_Lawsonibacter_sp000177015 <= 10^-3.4 & f_Anaerovoracaceae > 0,
    '1', '-1'
  )
]

In [13]:
# Cross-tabulate the class label (rows) against the group stored in `target`.
table(target$tc, target$group)

    
       a   b   c   d
  -1 319 299 214 361
  1  217 237 322 178

In [14]:
# Add label noise to the group column of X: every sample is flagged with
# probability 0.05, and a flagged sample receives a different label drawn from
# `groups`. `target$group` is left untouched by this step.
groups <- c('a', 'b', 'c', 'd', 'e')
set.seed(0)
brnounou <- rbinom(n = length(X$group), size = 1, prob = 0.05)
# seq_along() yields an empty loop when there are no samples, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(brnounou)) {
    if (brnounou[i] == 1) {
        # Re-seeding with the row index makes each replacement reproducible
        # independently of the other rows.
        set.seed(i)
        X$group[i] <- sample(groups[groups != X$group[i]], 1)
    }
}

In [15]:
# Cross-tabulate the class label (rows) against the group column of X.
table(target$tc, X$group)

    
       a   b   c   d   e
  -1 312 296 226 347  12
  1  219 232 318 175  10

# Train 

## data

In [19]:
# Encode the class label and the group column as factors.
target_c <- factor(target$tc)
X <- X[, group := as.factor(group)]

In [20]:
# Expand X into a dummy-variable design (caret::dummyVars over all columns)
# and keep the result as a data.table.
dummies <- as.data.table(
  predict(dummyVars(~ ., data = X), newdata = X)
)

In [21]:
# Strip every literal dot from the column names.
colnames(dummies) <- str_remove_all(colnames(dummies), pattern = '\\.')

In [22]:
# Draw ten training partitions, each holding 70% of the samples, as a list of
# row-index vectors.
set.seed(0)
trainIx <- createDataPartition(
  y = target_c,
  p = 0.7,
  times = 10,
  list = TRUE
)

## go!

In [16]:
library(clustermq)
# Point clustermq at the SGE scheduler and the user's job template, and define
# the values that are passed to that template.
options(
  clustermq.scheduler = "sge",
  clustermq.template = "~/.clustermq.tmpl"
)
tmpl <- list(
  conda = "r-ml",
  cores = 1,
  job_time = "00:59:00",
  job_mem = "1G"
)

In [17]:
#' Fit a random forest on one training partition and score the held-out rows
#'
#' @param ix Row indices of the training samples; the first element also
#'   seeds the random number generator.
#' @param data Predictor table, subset by row with `ix` and `-ix`.
#' @param target Response vector, subset with `ix` and `-ix`.
#' @param ntree Number of trees passed to randomForest().
#' @return A list with a single element `rf_performance`: the `overall`
#'   component of caret::confusionMatrix() computed on the rows not in `ix`.
wf <- function(ix, data, target, ntree = 500){
    set.seed(ix[1])
    message('RF')
    fit <- randomForest(x = data[ix,], y = target[ix], ntree = ntree)
    held_out_pred <- predict(object = fit, newdata = data[-ix, ])
    cm <- confusionMatrix(data = held_out_pred, reference = target[-ix])
    list(rf_performance = cm$overall)
}

In [31]:
# Run wf() once per training partition through clustermq, on 8 workers.
# NOTE(review): wf() only calls randomForest and caret functions, yet 'dplyr'
# and 'RRF' are also loaded on every worker; confirm they are needed.
res <- Q(
  wf,
  ix = trainIx,
  const = list(data = dummies, target = target_c, ntree = 500),
  n_jobs = 8,
  pkgs = c('caret', 'randomForest', 'dplyr', 'RRF'),
  log_worker = FALSE,
  template = tmpl
)

Submitting 8 worker jobs (ID: cmq6799) ...

Running 10 calculations (3 objs/8.7 Mb common; 1 calls/chunk) ...


[----------------------------------------------------]   0% (1/8 wrk) eta:  ?s

[----------------------------------------------------]   0% (2/8 wrk) eta:  ?s

[----------------------------------------------------]   0% (3/8 wrk) eta:  ?s

[----------------------------------------------------]   0% (4/8 wrk) eta:  ?s

[----------------------------------------------------]   0% (5/8 wrk) eta:  ?s

[----------------------------------------------------]   0% (6/8 wrk) eta:  ?s

[----------------------------------------------------]   0% (7/8 wrk) eta:  ?s

[----------------------------------------------------]   0% (8/8 wrk) eta:  ?s

[====>-----------------------------------------------]  10% (8/8 wrk) eta:  9m









                                                                              

Master: [108.4s 2.2% CPU]; Worker: [avg 99.8% CPU, max 531.3 Mb]



In [38]:
# Collect the performance vector of every partition into a data frame with
# one row per partition.
# NOTE(review): the name `all` masks base::all() for the rest of the session.
all <- as.data.frame(
  t(sapply(res, function(x) x$rf_performance))
)

In [41]:
# Distribution, then standard deviation (rounded to 4 decimals), of accuracy
# and kappa across the partitions.
summary(select(all, Accuracy, Kappa))
sapply(select(all, Accuracy, Kappa), function(x) round(sd(x), digits = 4))

    Accuracy          Kappa       
 Min.   :0.6781   Min.   :0.3360  
 1st Qu.:0.6804   1st Qu.:0.3437  
 Median :0.6874   Median :0.3541  
 Mean   :0.6869   Mean   :0.3558  
 3rd Qu.:0.6874   3rd Qu.:0.3602  
 Max.   :0.7030   Max.   :0.3860  