# Preamble

In [1]:
library(tidyverse)
library(stringr)
library(caret)
library(dplyr)
library('qs')
library(ranger)
library(parallel)
library(data.table)

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift


qs v0.23.4.


Attaching package: ‘da

In [2]:
library(clustermq)
# Set clustermq's scheduler option to "sge" and point its template option at
# ~/.clustermq.tmpl.
options(clustermq.scheduler = "sge", clustermq.template = "~/.clustermq.tmpl")

In [3]:
# Print the R version, platform and attached package versions of this session.
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/envs/r-ml/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] clustermq_0.8.95.1 data.table_1.13.4  ranger_0.12.1      qs_0.23.4         
 [5] caret_6.0-86       lattice_0.20-41    forcats_0.5.0      stringr_1.4.0     
 [9] dplyr_1.0.2        purrr_0.3.4        readr_1.4.0        tidyr_1.1.2       
[13] tibble_3.

# Functions

In [4]:
# Source every helper script found in ../scripts/.
# `pattern` is a regular expression, not a glob: anchor on a literal ".R" at the
# end of the file name (case-insensitive) so that only R scripts are picked up.
# lapply() keeps the iteration variables out of the global environment, so no
# rm() cleanup is needed (and nothing warns when the directory is empty).
invisible(lapply(
    list.files("../scripts/", pattern = "\\.R$", full.names = TRUE, ignore.case = TRUE),
    source
))

In [5]:
#' Fit a case-weighted random forest on one resample and score the held-out rows.
#'
#' @param ix Row indices of `data` used for training; `ix[1]` also seeds the RNG.
#' @param data Data frame of predictors. Character columns are converted to factors.
#' @param meta Not used by this function; kept in the signature for existing callers.
#' @param target Factor with one class label per row of `data`.
#' @param num.trees Number of trees handed to `ranger()`.
#' @return A list with `rf_model` (the `ranger()` fit) and, only when `ix` does
#'   not cover every row of `data`, `rf_performance` (the `overall` element of
#'   `confusionMatrix()` computed on the rows outside `ix`).
noneRanger_wcase <- function(ix, data, meta, target, num.trees = 500) {
    # Fail early: for a non-factor `target`, levels() is NULL and the case
    # weights passed to ranger() would silently become NA.
    if (!is.factor(target)) {
        stop("`target` must be a factor", call. = FALSE)
    }
    if (length(target) != nrow(data)) {
        stop("`target` must have one entry per row of `data`", call. = FALSE)
    }
    data <- mutate_if(data, is.character, as.factor)

    message("RF")
    # Inverse-share weights: rows of the first level get (1 - share), all other
    # rows get share, where share is the fraction of the first level in the
    # whole `target` (not only in `target[ix]`), rounded to two digits.
    # NOTE(review): the rounding makes one of the two weights exactly 0 when the
    # first level covers less than about 0.5% or more than about 99.5% of
    # `target` -- confirm this cannot happen for the intended data.
    is_first <- target == levels(target)[1]
    first_share <- round(sum(is_first) / length(target), digits = 2)
    case_weights <- ifelse(is_first, 1 - first_share, first_share)

    # Seed immediately before fitting so the forest is reproducible per resample.
    set.seed(ix[1])
    rf_fs <- ranger(x = data[ix, ], y = target[ix], case.weights = case_weights[ix],
        num.trees = num.trees, importance = "impurity")
    res <- list(rf_model = rf_fs)

    # Only evaluate when some rows were left out of the training indices.
    if (length(ix) != nrow(data)) {
        pred <- predict(rf_fs, data = data[-ix, ])
        tmp <- confusionMatrix(data = pred$predictions, reference = target[-ix])
        res$rf_performance <- tmp$overall
    }
    res
}

# Data

In [6]:
# Read the preprocessed inputs from ../data/. The age, BMI and genderfemale
# columns are removed from the metadata right after loading.
meta <- select(
    qread('../data/meta_pp.qs'),
    -all_of(c('age', 'BMI', 'genderfemale'))
)
tax <- qread('../data/taxa_nometh.qs')
pth <- qread('../data/pathways_nometh.qs')
targets <- qread('../data/targets_meth.qs')

In [7]:
# Attach taxa, pathways and the Mtbc label to the metadata by Sample
# (left joins, so every metadata row is kept).
pp <- meta %>%
    left_join(tax, by = 'Sample') %>%
    left_join(pth, by = 'Sample') %>%
    left_join(select(targets, Sample, Mtbc), by = 'Sample')
# Keep the label as a separate factor, then drop it and the sample id from the
# predictor table.
target <- as.factor(pp[['Mtbc']])
pp <- select(pp, -Sample, -Mtbc)

In [8]:
# Show the number of rows and columns of the merged predictor table.
dim(pp)

In [9]:
# Fix the RNG, then draw 10 training partitions with p = 0.7 each, returned as
# a list of row-index vectors.
set.seed(0)
trainIx <- createDataPartition(target, p = 0.7, times = 10, list = TRUE)

In [10]:
# Columns whose name contains "dataset", followed by the number_reads column.
meta_names <- pp %>%
    colnames() %>%
    str_subset(pattern = 'dataset') %>%
    c('number_reads')

In [11]:
# Display the selected column names.
meta_names

# CV

In [12]:
# Per-job settings for clustermq -- presumably substituted into the scheduler
# template at ~/.clustermq.tmpl; verify the field names against that template.
tmpl <- list(
    conda = "r-ml",
    cores = 5,
    job_time = '00:59:00',
    job_mem = '5G'
)

In [13]:
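# Do not run: this submits jobs to the cluster via clustermq. The saved output is loaded from ../tmp/none_ranger_nt250.qs further down.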
#res <- Q(noneRanger_wcase
#  , ix = trainIx
#  , const = list('data'= pp, 'target' = target, num.trees = 250
#                 )
#  , n_jobs= length(trainIx)
#  , pkgs=c('caret', 'ranger', 'dplyr')
#  , log_worker=FALSE
#  , template = tmpl
# )

In [14]:
#qsave(res, '../tmp/none_ranger_nt250.qs')

In [15]:
#res500 <- Q(noneRanger_wcase
#  , ix = trainIx
#  , const = list('data'= pp, 'target' = target, num.trees = 500)
#  , n_jobs= length(trainIx)
#  , pkgs=c('caret', 'ranger', 'dplyr')
#  , log_worker=FALSE
#  , template = tmpl
# )

In [16]:
# qsave(res500, '../tmp/none_ranger_nt500.qs')

# Results

In [18]:
# Load the cached results of the 250-tree and 500-tree runs instead of
# recomputing them (presumably written by the commented-out qsave() calls
# above -- verify the files are current).
res250 <- qread('../tmp/none_ranger_nt250.qs')
res500 <- qread('../tmp/none_ranger_nt500.qs')

In [19]:
# Stack the per-resample rf_performance vectors of both runs into one data
# frame, tagging each row with the number of trees it was fitted with.
tmp <- add_column(
    as.data.frame(do.call(rbind, lapply(res500, function(fit) fit$rf_performance))),
    ntrees = 500
)
all <- rbind(
    add_column(
        as.data.frame(do.call(rbind, lapply(res250, function(fit) fit$rf_performance))),
        ntrees = 250
    ),
    tmp
)

In [20]:
# Mean of every performance metric per tree count, highest mean Kappa first,
# showing Accuracy and Kappa rounded to 4 digits.
# across() replaces the superseded summarise_all()/mutate_all() verbs.
all %>%
    group_by(ntrees) %>%
    summarise(across(everything(), mean)) %>%
    arrange(desc(Kappa)) %>%
    select(Accuracy, Kappa, ntrees) %>%
    mutate(across(everything(), function(x) round(x, digits = 4)))

Accuracy,Kappa,ntrees
<dbl>,<dbl>,<dbl>
0.8142,0.5779,500
0.8135,0.5765,250


In [21]:
# Standard deviation of every performance metric per tree count, showing
# Accuracy and Kappa rounded to 4 digits. Rows are ordered by the standard
# deviation of Kappa (largest first), so the row order can differ from the
# table of means.
# across() replaces the superseded summarise_all()/mutate_all() verbs.
all %>%
    group_by(ntrees) %>%
    summarise(across(everything(), sd)) %>%
    arrange(desc(Kappa)) %>%
    select(Accuracy, Kappa, ntrees) %>%
    mutate(across(everything(), function(x) round(x, digits = 4)))

Accuracy,Kappa,ntrees
<dbl>,<dbl>,<dbl>
0.0128,0.0305,250
0.0119,0.0283,500
