# Preamble

In [1]:
library(tidyverse)
library(stringr)
library(caret)
library(dplyr)
library('qs')
library(ranger)
library(parallel)
library(data.table)

── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.4     ✔ dplyr   1.0.2
✔ tidyr   1.1.2     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.0

── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift


qs v0.23.4.


Attaching package: ‘da

In [2]:
library(clustermq)
# Set the clustermq scheduler to "sge" and point it at the user's template file.
options(
    clustermq.scheduler = "sge",
    clustermq.template = "~/.clustermq.tmpl"
)

In [3]:
# Print the R version, platform, locale and attached packages of this session.
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/envs/r-ml/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] clustermq_0.8.95.1 data.table_1.13.4  ranger_0.12.1      qs_0.23.4         
 [5] caret_6.0-86       lattice_0.20-41    forcats_0.5.0      stringr_1.4.0     
 [9] dplyr_1.0.2        purrr_0.3.4        readr_1.4.0        tidyr_1.1.2       
[13] tibble_3.

# Functions

In [4]:
# Source every R script found in ../scripts/ into the current session.
# `pattern` is a regular expression, not a glob: '\\.R$' keeps only names that
# end in ".R" (or ".r", because ignore.case=TRUE) instead of any name that merely
# contains an "R" somewhere after its first character (e.g. "notes.Rmd").
file.sources <- list.files('../scripts/', pattern = '\\.R$', full.names=TRUE, ignore.case=TRUE)
for (f in file.sources) {
    source(f)
}
# Drop the loop helpers so they do not linger in the workspace.
rm('file.sources', 'f')

In [5]:
# Run tagRRFRanger_wcase once per element of `trainIx` for a single
# (gamma, k) setting.
#
# Arguments:
#   trainIx   list whose elements are handed, one at a time, to
#             tagRRFRanger_wcase as its first argument.
#   data, meta, target, families, genera, species
#             forwarded unchanged to tagRRFRanger_wcase.
#   params    object with 'gamma' and 'k' entries; both are forwarded and also
#             used to build the element names of the result.
#   num.trees forwarded to tagRRFRanger_wcase (default 500).
#
# Returns a list with one tagRRFRanger_wcase result per element of `trainIx`;
# every element carries the same name "<gamma>_<k>".
gammaTuning <- function(trainIx, data, meta, target, families, genera, species, params, num.trees=500){
    fitOne <- function(ix) {
        tagRRFRanger_wcase(ix, data=data, meta=meta, target=target
                           , families = families, genera = genera, species = species
                           , gamma=params[['gamma']], k = params[['k']], num.trees=num.trees)
    }
    fits <- lapply(trainIx, fitOne)
    # All fits of this call share one parameter label.
    names(fits) <- rep(paste0(params['gamma'], '_', params['k']), length(fits))
    fits
}

# Data

## data

In [6]:
# Read the four input tables from ../data/.
# The columns 'age', 'BMI' and 'genderfemale' are removed from the metadata.
meta <- qread('../data/meta_pp.qs')
meta <- select(meta, -all_of(c('age', 'BMI', 'genderfemale')))
tax <- qread('../data/taxa_nometh.qs')
pth <- qread('../data/pathways_nometh.qs')
targets <- qread('../data/targets_meth.qs')

In [7]:
# Merge metadata, taxa, pathways and the Mtbc target column on 'Sample'.
pp <- meta %>%
    left_join(tax, by = 'Sample') %>%
    left_join(pth, by = 'Sample') %>%
    left_join(targets %>% select(Sample, Mtbc), by = 'Sample')
# Keep the response as a factor, then drop it (and the sample id) from the predictors.
target <- as.factor(pp$Mtbc)
pp <- select(pp, -Sample, -Mtbc)

In [8]:
# Show the number of rows and columns of the predictor table.
dim(pp)

In [9]:
# Fix the RNG seed before drawing the partitions.
set.seed(0)
# Ten training partitions, each holding 70% of the samples, returned as a list.
trainIx <- createDataPartition(
    y = target,
    times = 10,
    p = 0.7,
    list = TRUE
)

In [10]:
# Columns whose name contains 'dataset', plus 'number_reads'.
meta_names <- colnames(pp) %>%
    str_subset(pattern = 'dataset') %>%
    c('number_reads')

In [11]:
# Display the selected column names.
meta_names

## get taxa lists for FS

In [12]:
# Load the shared taxa-list helper script.
# NOTE(review): getFamilies / getGenera / getSpecies used further down are
# presumably defined in this script - confirm.
source('../../Common_scripts/get_taxa_lists.R')

In [13]:
# Read the taxonomy table and apply endoR::compatibleNames to every column.
# across(everything(), ...) replaces the superseded scoped verb mutate_all().
tax_names <- qread('../data/taxa_table.qs')
tax_names <- mutate(tax_names, across(everything(), endoR::compatibleNames))

“replacing previous import ‘data.table::last’ by ‘dplyr::last’ when loading ‘endoR’”
“replacing previous import ‘data.table::first’ by ‘dplyr::first’ when loading ‘endoR’”
“replacing previous import ‘data.table::between’ by ‘dplyr::between’ when loading ‘endoR’”
“replacing previous import ‘dplyr::union’ by ‘igraph::union’ when loading ‘endoR’”
“replacing previous import ‘dplyr::as_data_frame’ by ‘igraph::as_data_frame’ when loading ‘endoR’”
“replacing previous import ‘dplyr::groups’ by ‘igraph::groups’ when loading ‘endoR’”


In [14]:
# Prefix each value of the f, g and s columns with its own column letter
# ("f_", "g_", "s_").
tax_names$f <- paste('f', tax_names$f, sep = '_')
tax_names$g <- paste('g', tax_names$g, sep = '_')
tax_names$s <- paste('s', tax_names$s, sep = '_')

In [15]:
# For each distinct family, genus and species label, call the matching helper
# on the taxonomy table and store the result under that label.
families <- setNames(
    lapply(unique(tax_names$f), function(fam) getFamilies(fam, tax_names = tax_names)),
    unique(tax_names$f)
)
genera <- setNames(
    lapply(unique(tax_names$g), function(gen) getGenera(gen, tax_names = tax_names)),
    unique(tax_names$g)
)
species <- setNames(
    lapply(unique(tax_names$s), function(sp) getSpecies(sp, tax_names = tax_names)),
    unique(tax_names$s)
)

# CV

In [16]:
# Candidate values for gamma and k: 0, 0.1, ..., 1 (11 values each).
gammas <- seq(from = 0, to = 1, by = 0.1)
ks <- seq(from = 0, to = 1, by = 0.1)

In [17]:
# Full grid of (gamma, k) pairs as a list with one length-2 numeric vector per
# combination: expand the grid, transpose so each combination is a column, then
# split the columns into list elements.
params <- as.list(
    data.frame(
        t(expand.grid(gammas, ks))
    )
)

In [18]:
# Label the two entries of every combination, then name each combination
# "<gamma>_<k>".
params <- lapply(params, setNames, nm = c('gamma', 'k'))
names(params) <- vapply(params, paste, character(1), collapse = '_')

In [19]:
# Values passed as `template` to the (currently commented-out) Q() call below.
tmpl <- list(
    conda = "r-ml",
    cores = 10,
    job_time = '24:00:00',
    job_mem = '5G'
)

In [20]:
# Disabled: submission of gammaTuning through Q(), with n_jobs = length(params)
# and the shared inputs passed via `const`.
# NOTE(review): num.trees = 250 here, while the results file used in the next
# cells is named 'ta_ranger_nt500.qs' - confirm which value produced that file.
#res <- Q(gammaTuning
#  , params = params
#  , const = list('data'= pp, 'meta' = meta_names, 'target' = target, 'trainIx' = trainIx
#                 , 'families' = families,'genera' = genera, 'species' = species, num.trees = 250
#                 )
#  , export = list('tagRRFRanger_wcase' = tagRRFRanger_wcase)
#  , n_jobs= length(params)
#  , pkgs=c('caret', 'ranger', 'dplyr', 'RRF')
#  , log_worker=FALSE
#  , template = tmpl
# )

In [21]:
# qsave(res, '../tmp/ta_ranger_nt500.qs')

In [22]:
# Load the saved tuning results from disk.
# NOTE(review): presumably the output of the commented-out Q(gammaTuning, ...)
# run, in the same order as `params` - confirm.
res <- qread('../tmp/ta_ranger_nt500.qs')

# res

In [23]:
# Build one data frame per parameter combination: one row per element of
# res[[i]] holding its rf_performance values, the number of entries in its
# `confirmed` element (nconf), and the gamma / k taken from params[[i]].
# seq_along() keeps this safe when `res` is empty (1:length(res) would iterate
# over c(1, 0)), and lapply() avoids growing the list inside a loop.
# NOTE(review): params[[i]] is matched to res[[i]] by position only - confirm
# that `res` was produced in the order of `params`.
all <- lapply(seq_along(res), function(i) {
    perf <- as.data.frame(do.call(rbind, lapply(res[[i]], function(x) x$rf_performance)))
    perf$nconf <- vapply(res[[i]], function(x) length(x$confirmed), integer(1))
    perf$gamma <- params[[i]]['gamma']
    perf$k <- params[[i]]['k']
    perf
})

In [24]:
# Stack the per-combination data frames into a single data frame
# (replaces the list previously stored in `all`).
all <- do.call(rbind, all)

In [25]:
# Average every column per (gamma, k) combination and show the ten
# combinations with the highest mean Kappa.
# across(everything(), mean) replaces the superseded summarise_all();
# .groups = 'drop' returns an ungrouped table so no grouping leaks into the
# later steps.
all %>%
    group_by(gamma, k) %>%
    summarise(across(everything(), mean), .groups = 'drop') %>%
    arrange(desc(Kappa)) %>%
    head(10) %>%
    select(Accuracy, Kappa, k, gamma, nconf)

Accuracy,Kappa,k,gamma,nconf
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.8256061,0.6024194,1.0,0.8,275.2
0.8254545,0.6019631,1.0,0.4,295.1
0.825303,0.6012391,1.0,0.9,274.6
0.825303,0.6004849,1.0,0.7,280.0
0.8251515,0.6002243,1.0,0.5,286.6
0.8245455,0.5989189,0.9,0.4,123.9
0.8240909,0.5986122,1.0,1.0,270.8
0.8237879,0.5971145,1.0,0.3,307.4
0.8221212,0.5947366,1.0,0.6,283.7
0.8216667,0.5937547,1.0,0.2,332.5
