# Preambule

In [1]:
library(tidyverse)
library(stringr)
library(caret)
library(data.table)
library(stringr)
library(dplyr)
library(randomForest)
library(qs)
library(parallel)
library(igraph)
library(ggraph)
library(clustermq)
library(inTrees)
library(RRF)
library(endoR)

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift



Attaching package: ‘data.table’


T

In [2]:
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/envs/r-ml/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] endoR_0.1.0         RRF_1.9.1           inTrees_1.3        
 [4] clustermq_0.8.95.1  ggraph_2.0.4        igraph_1.2.6       
 [7] qs_0.23.4           randomForest_4.6-14 data.table_1.13.4  
[10] caret_6.0-86        lattice_0.20-41     forcats_0.5.0    

In [3]:
options(clustermq.scheduler = "sge", clustermq.template = "~/.clustermq.tmpl")

# Data

In [4]:
taxa <- qread('../data/taxa_table.qs')
pp <- qread('../data/tax_meta.qs')

In [5]:
# remove the MetaCyc pathways
to_rm <- seq(pp %>% colnames %>% str_which(pattern = '^[:lower:]\\_{1}') %>% max, ncol(pp), 1)
pp <- pp[,-to_rm]
pp %>% dim

In [6]:
# remove the metadata
meta <- c('dataset_name','Sample','age','gender','country','BMI','westernized')
X <- select(pp, -all_of(meta))

In [7]:
# keep only taxa with prevalance > 24%
tmp <- colnames(X)[which(colSums(X != 0) > nrow(X)/4)]
X <- select(X, all_of(tmp))
X %>% dim

In [8]:
# shuffle samples
set.seed(0)
X <- X[complete.cases(X),][sample(1:nrow(X)),]

# Make the target

In [9]:
# draw random variables for predicting groups
set.seed(1209)
var_ix <- sample(which(colSums(X != 0) > nrow(X)/2), 9, replace = FALSE)
var_n <- colnames(X)[var_ix]

In [10]:
var_n

In [11]:
nr <- nrow(X)
ng <- floor(nr/4)
# make groups
X <- as.data.table(X)[,'group':=c(rep('a',ng), rep('b', ng), rep('c', ng), rep('d', nr-3*ng))]

In [12]:
target <- data.frame('group'= X$group, 'tc'='1', stringsAsFactors = FALSE)

In [13]:
# for each group, make target according to random drawn taxa of var_n
target$tc[target$group == 'a'] <- ifelse( X$s_Marvinbryantia_sp900066075[target$group == 'a'] > 0 
                                         & X$g_Alistipes_A[target$group == 'a'] > 0 
                                         , '1', '-1')
target$tc[target$group == 'b'] <- ifelse( X$f_Bacteroidaceae[target$group == 'b'] > 10^-(1) 
                                         & X$g_Dialister[target$group == 'b'] > 10^-(2.5)
                                         , '1', '-1')
target$tc[target$group == 'c'] <- ifelse((X$s_Oscillibacter_sp001916835[target$group == 'c'] > 0 
                                         & X$s_Bacteroides_clarus[target$group == 'c'] > 0)
                                         | X$s_Faecalibacterium_prausnitzii_G[target$group == 'c'] >10^-2
                                         , '1', '-1')
target$tc[target$group == 'd'] <- ifelse( X$s_Lawsonibacter_sp000177015[target$group == 'd'] <= 10^-3.4 
                                         & X$f_Anaerovoracaceae[target$group == 'd'] > 0
                                         , '1', '-1')

In [14]:
table(target$tc, target$group)

    
       a   b   c   d
  -1 319 299 214 361
  1  217 237 322 178

In [15]:
# randomise group labels
groups <- c('a', 'b', 'c', 'd', 'e')
set.seed(0)
brnounou <- rbinom(n = length(X$group), size = 1,prob = 0.05)
for (i in 1:length(brnounou)){
    if (brnounou[i] == 1){
        set.seed(i)
        X$group[i] <- sample(groups[groups != X$group[i]], 1)
    }
}

In [16]:
table(target$tc, X$group)

    
       a   b   c   d   e
  -1 312 296 226 347  12
  1  219 232 318 175  10

# Train 

## data

In [17]:
target_c <- as.factor(target$tc)
X <- X[, 'group':= as.factor(group)]

In [18]:
# transform to dummy
dummies <- dummyVars(~ ., data = X )
dummies <- as.data.table(predict(dummies, newdata = X ))

In [19]:
colnames(dummies) <- colnames(dummies) %>% str_replace_all(pattern = '\\.', replacement ='')

## CV

In [24]:
tmpl <- list(conda = "r-ml", cores = 1, job_time = '00:59:00', job_mem = '1G')

In [25]:
wf <- function(ix, data, target, ntree = 500, gamma = 1){
    set.seed(ix[1])
    res <- list()
    # feature selection
    message('Feature selection')
    RF <- RRF(data[ix,], flagReg=0, as.factor(target[ix]))
    imp <-RF$importance[,"MeanDecreaseGini"]
    impRF <- (imp - min(imp))/(max(imp) - min(imp))# normalization
    coefReg <- (1-gamma) + gamma*impRF
    GRRF <- RRF(data[ix,], as.factor(target[ix]), flagReg=1, coefReg=coefReg)
    
    # select data
    message('Subset data')
    to_keep <- colnames(data)[GRRF$feaSet]
    X_fs <- select(data, all_of(to_keep))
    res$confirmed <- to_keep
    
    # RF
    message('RF')
    rf_fs <- randomForest(x = X_fs[ix,], y = target[ix], ntree = ntree)
    pred <- predict(object = rf_fs, newdata = X_fs[-ix, ])
    tmp <- confusionMatrix(data = pred, reference = target[-ix])
    res$rf_performance <- tmp$overall
    
    return(res)
}

In [26]:
gammaTuning <- function(trainIx, data, target, gamma = 1, ntree = 500){
    res <- lapply(trainIx, wf, data=data, target=target, gamma=gamma, ntree=ntree)
    return(res)
}

In [22]:
set.seed(0)
trainIx <- createDataPartition(y = target_c, times = 10, p = .7, list = TRUE)

In [23]:
gammas <- seq(0,1, by = 0.05)

In [33]:
res <- Q(gammaTuning
  , gamma = gammas
  , const = list('data'= dummies, 'target' = target_c, 'trainIx' = trainIx, 'ntree' = 500)
  , export = list('wf' = wf)
  , n_jobs= length(gammas)
  , pkgs=c('caret', 'randomForest', 'dplyr', 'RRF')
  , log_worker=FALSE
  , template = tmpl
 )
qsave(res, file = '../../tmp/Mtg_p005_gRRF_CV.qs')

Submitting 21 worker jobs (ID: cmq6799) ...

Running 21 calculations (5 objs/8.7 Mb common; 1 calls/chunk) ...


[---------------------------------------------------]   0% (1/21 wrk) eta:  ?s

[---------------------------------------------------]   0% (2/21 wrk) eta:  ?s

[---------------------------------------------------]   0% (3/21 wrk) eta:  ?s

[---------------------------------------------------]   0% (4/21 wrk) eta:  ?s

[---------------------------------------------------]   0% (5/21 wrk) eta:  ?s

[---------------------------------------------------]   0% (6/21 wrk) eta:  ?s

[---------------------------------------------------]   0% (7/21 wrk) eta:  ?s

[---------------------------------------------------]   0% (8/21 wrk) eta:  ?s

[---------------------------------------------------]   0% (9/21 wrk) eta:  ?s

[--------------------------------------------------]   0% (10/21 wrk) eta:  ?s

[--------------------------------------------------]   0% (11/21 wrk) eta:  ?s

[------

In [20]:
res <- qread('/ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/RF_coocc/RF_coocc/tmp/Mtg_p005_gRRF_CV.qs')

In [25]:
# average results across CV sets
all <- res %>% lapply(function(x){t(sapply(x, function(x){return(x$rf_performance)}))})
all <- t(all %>% sapply(function(x){return(c('meanAcc' = mean(x[,1]*100), 'sdAcc' = sd(x[,1]*100)
                                             , 'meanK' = mean(x[,2]), 'sdK' = sd(x[,2])))}))
rownames(all) <- gammas

In [26]:
# gamma = 0.45: best results across 10-folds CV
all %>% as.data.frame %>% arrange(desc(meanK), desc(meanAcc)) %>% 
head(3)

Unnamed: 0_level_0,meanAcc,sdAcc,meanK,sdK
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
0.45,85.1944,2.362918,0.6969377,0.04909877
0.5,84.85226,2.393879,0.6910305,0.04903255
0.55,84.55677,3.060819,0.6845043,0.06331969


# final

In [27]:
gamma <- 0.45

In [28]:
set.seed(0)
message('Feature selection')
RF <- RRF(dummies, flagReg=0, as.factor(target_c))
imp <-RF$importance[,"MeanDecreaseGini"]
impRF <- (imp - min(imp))/(max(imp) - min(imp))# normalization
coefReg <- (1-gamma) + gamma*impRF
GRRF <- RRF(dummies, as.factor(target_c), flagReg=1, coefReg=coefReg)

Feature selection



In [29]:
# select data
to_keep <- colnames(dummies)[GRRF$feaSet]
X_fs <- select(dummies, all_of(to_keep))

In [30]:
length(to_keep)

In [31]:
# not all true variables were selected 
var_n %in% to_keep
var_n

In [32]:
# RF
set.seed(0)
rf_fs <- randomForest(x = X_fs, y = as.factor(target_c), ntree = 500)

In [33]:
rf_fs
cat('RF accuracy = ', 100*round((rf_fs$confusion[1,1] + rf_fs$confusion[2,2])/sum(rf_fs$confusion), digits = 4))


Call:
 randomForest(x = X_fs, y = as.factor(target_c), ntree = 500) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 4

        OOB estimate of  error rate: 6.66%
Confusion matrix:
     -1   1 class.error
-1 1145  48  0.04023470
1    95 859  0.09958071

RF accuracy =  93.33

In [34]:
gini <- rf_fs$importance %>% as.data.frame %>% arrange(desc(MeanDecreaseGini))
gini$feature <- rownames(gini)
rownames(gini) <- NULL
gini$feature <- factor(gini$feature, levels = gini$feature[order(-gini$MeanDecreaseGini)])

In [35]:
gini %>% head

Unnamed: 0_level_0,MeanDecreaseGini,feature
Unnamed: 0_level_1,<dbl>,<fct>
1,79.41305,f_Bacteroidaceae
2,72.01129,s_Lawsonibacter_sp000177015
3,71.3286,groupb
4,70.64042,s_Faecalibacterium_prausnitzii_G
5,66.98962,g_Alistipes_A
6,61.49104,f_Anaerovoracaceae


In [36]:
saveRDS(gini, 'Gini_main.RDS')

# 100 resamples

In [35]:
table(target_c)

target_c
  -1    1 
1193  954 

In [37]:
preclu <- preCluster(model = rf_fs, model_type = 'rf', sample_weight = c(0.55, 0.45), classPos = '1'
                , dummy_var = 'group'
                , discretize = TRUE, K = 3
                , times = 100, p = .5
                , ntree = 'all'
                , data = X_fs, target = target_c
                , seed = 1
                , in_parallel = TRUE, n_cores = 15)

Extract rules...

Discretise data

Discretise rules



In [38]:
qsave(preclu, 'Pre_bootstraps.qs')

In [39]:
tmpl <- list(conda = "r-ml", cores = 15, job_time = '24:00:00', job_mem = '3G')

In [40]:
rules <- Q(model2DE_cluster
  , partition = preclu$partitions
  , export=list(data = preclu$data
                , target = target_c
                , exec = preclu$exec
                , classPos = '1'
                , prune = TRUE, maxDecay = 0.05, typeDecay = 2 
                , filter = TRUE
                , in_parallel = TRUE, n_cores = 15
               )
  , n_jobs= 100
  , pkgs=c('data.table', 'parallel', 'caret', 'stringr', 'scales', 'dplyr', 'inTrees', 'babR')
  , log_worker=FALSE
  , template = tmpl
 )

Submitting 100 worker jobs (ID: cmq6048) ...

Running 100 calculations (10 objs/165.7 Mb common; 1 calls/chunk) ...


[--------------------------------------------------]   0% (1/100 wrk) eta:  ?s

[--------------------------------------------------]   0% (2/100 wrk) eta:  ?s

[--------------------------------------------------]   0% (3/100 wrk) eta:  ?s

[--------------------------------------------------]   0% (4/100 wrk) eta:  ?s

[--------------------------------------------------]   0% (5/100 wrk) eta:  ?s

[--------------------------------------------------]   0% (6/100 wrk) eta:  ?s

[--------------------------------------------------]   0% (7/100 wrk) eta:  ?s

[--------------------------------------------------]   0% (8/100 wrk) eta:  ?s

[--------------------------------------------------]   0% (9/100 wrk) eta:  ?s

[-------------------------------------------------]   0% (10/100 wrk) eta:  ?s

[-------------------------------------------------]   0% (11/100 wrk) eta:  ?s

[-









































































































































                                                                              

Master: [2130.4s 4.4% CPU]; Worker: [avg 5.0% CPU, max 1501.8 Mb]



In [41]:
qsave(rules, 'All_bootstraps.qs')