# Preamble

In [1]:
library(tidyverse)
library(stringr)
library(caret)
library(dplyr)
library('qs')

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.4     ✔ dplyr   1.0.2
✔ tidyr   1.1.2     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.0

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift


qs v0.23.4.



In [2]:
# Load clustermq and configure it to submit jobs through the SGE scheduler,
# using the job template file stored in the user's home directory.
library(clustermq)
options(clustermq.scheduler = "sge", clustermq.template = "~/.clustermq.tmpl")

In [3]:
# Print the R version, platform and attached packages used for this run.
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/envs/r-ml/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] clustermq_0.8.95.1 qs_0.23.4          caret_6.0-86       lattice_0.20-41   
 [5] forcats_0.5.0      stringr_1.4.0      dplyr_1.0.2        purrr_0.3.4       
 [9] readr_1.4.0        tidyr_1.1.2        tibble_3.0.4       ggplot2_3.3.2     
[13] tidyverse_1.3.0   

loaded via a namespace (and not attached): [output truncated]

# Functions

In [4]:
# Source every R script found in ../scripts/.
# `pattern` is a regular expression, not a glob: the former '*.R' matched any
# file whose name contains a character followed by "r"/"R" (e.g. .Rmd, .RData,
# 'params.txt'), so non-script files could be sourced. '\\.R$' keeps only
# names ending in .R (or .r, because ignore.case = TRUE).
file.sources <- list.files('../scripts/', pattern = '\\.R$', full.names = TRUE, ignore.case = TRUE)
for (f in file.sources) {
    source(f)
}
# Remove the loop helpers; `f` does not exist when no script was found, so
# only remove what is actually defined to avoid a warning.
rm(list = intersect(c('file.sources', 'f'), ls()))

In [5]:
# Load shared helpers from the common scripts directory (presumably the
# getFamilies/getGenera/getSpecies functions used below — confirm).
source('../../Common_scripts/get_taxa_lists.R')

# Data

## data

In [6]:
# Read the input tables. The age, BMI and genderfemale columns are dropped
# from the metadata right after loading.
meta <- select(
    qread('../data/meta_pp.qs'),
    -all_of(c('age', 'BMI', 'genderfemale'))
)
tax <- qread('../data/taxa_nometh.qs')
pth <- qread('../data/pathways_nometh.qs')
targets <- qread('../data/targets_meth.qs')

In [7]:
# Assemble one table per sample: metadata + taxa + pathways, plus the Mtbc
# column of the targets table, all matched on 'Sample'.
pp <- meta %>%
    left_join(tax, by = 'Sample') %>%
    left_join(pth, by = 'Sample') %>%
    left_join(select(targets, Sample, Mtbc), by = 'Sample')
# The response is kept separately as a factor ...
target <- as.factor(pp$Mtbc)
# ... and removed, together with the sample identifier, from the predictors.
pp <- select(pp, -Sample, -Mtbc)

In [8]:
# Dimensions (rows, columns) of the predictor table.
dim(pp)

In [9]:
# Fix the RNG seed so the partitions are reproducible, then draw 10 training
# partitions, each holding 70% of the samples, returned as a list of indices.
set.seed(0)
trainIx <- createDataPartition(
    y = target,
    times = 10,
    p = 0.7,
    list = TRUE
)

In [10]:
# Metadata predictors: every column whose name contains 'dataset', plus the
# sequencing depth column 'number_reads'.
meta_names <- c(
    str_subset(colnames(pp), pattern = 'dataset'),
    'number_reads'
)

In [11]:
# Display the selected metadata column names.
meta_names

## Get taxa lists for feature selection (FS)

In [12]:
# Read the taxonomy table and pass every column through
# endoR::compatibleNames.
tax_names <- qread('../data/taxa_table.qs') %>%
    mutate_all(endoR::compatibleNames)

“replacing previous import ‘data.table::last’ by ‘dplyr::last’ when loading ‘endoR’”
“replacing previous import ‘data.table::first’ by ‘dplyr::first’ when loading ‘endoR’”
“replacing previous import ‘data.table::between’ by ‘dplyr::between’ when loading ‘endoR’”
“replacing previous import ‘dplyr::union’ by ‘igraph::union’ when loading ‘endoR’”
“replacing previous import ‘dplyr::as_data_frame’ by ‘igraph::as_data_frame’ when loading ‘endoR’”
“replacing previous import ‘dplyr::groups’ by ‘igraph::groups’ when loading ‘endoR’”


In [13]:
# Prefix the f, g and s columns with 'f_', 'g_' and 's_' respectively.
tax_names <- mutate(
    tax_names,
    f = paste0('f_', f),
    g = paste0('g_', g),
    s = paste0('s_', s)
)

In [14]:
# For each taxonomic column (f, g, s) call the matching helper once per
# unique value and name each result after that value.
families <- setNames(
    lapply(unique(tax_names$f), getFamilies, tax_names = tax_names),
    unique(tax_names$f)
)
genera <- setNames(
    lapply(unique(tax_names$g), getGenera, tax_names = tax_names),
    unique(tax_names$g)
)
species <- setNames(
    lapply(unique(tax_names$s), getSpecies, tax_names = tax_names),
    unique(tax_names$s)
)

# CV

In [22]:
# Feature-selection grid: every combination of gamma and k in 0, 0.1, ..., 1.
# expand.grid() varies gamma fastest; each combination becomes one named
# numeric vector c(gamma = ., k = .) in the list.
gammas <- seq(0, 1, by = 0.1)
ks <- seq(0, 1, by = 0.1)
fs_param <- lapply(
    as.list(data.frame(t(expand.grid(gammas, ks)))),
    function(pair) setNames(pair, c('gamma', 'k'))
)

In [16]:
# XGBoost tuning grid: every combination of number of boosting rounds and
# maximal tree depth, stored as a list of named vectors
# c(nrounds = ., max_depth = .).
nrounds <- c(10, 50, 100, 250, 500, 750, 1000, 1500)
max_depth <- 1:10
xgboost_param <- lapply(
    as.list(data.frame(t(expand.grid(nrounds, max_depth)))),
    function(pair) setNames(pair, c('nrounds', 'max_depth'))
)

In [15]:
# Proportion of samples in each class, in the order of levels(target).
# The previous code computed only the proportion of the first class, so
# class_weights[2] was NA and every sample of the first class received an NA
# case weight.
# NOTE(review): results saved before this fix may have been produced with NA
# weights — confirm and re-run the tuning if so.
stopifnot(nlevels(target) == 2)  # the weighting below is defined for two classes only
class_weights <- round(as.numeric(table(target)) / length(target), digits = 2)
# Each sample is weighted by the proportion of the *other* class, so the less
# frequent class gets the larger weight.
case_weights <- ifelse(target == levels(target)[1], class_weights[2], class_weights[1])

In [16]:
# Values filled into the clustermq job template for each submitted job.
tmpl <- list(
    conda = "r-ml",
    cores = 10,
    job_time = '24:00:00',
    job_mem = '10G'
)

In [17]:
## to run on HPC
#tuning_res <- Q(gammaTuning
#  , fs_param = fs_param
#  , const = list('trainIx' = trainIx, 'data'= pp, 'meta' = meta_names, 'target' = target
#                 , 'case_weights'=case_weights
#                 , 'families' = families,'genera' = genera, 'species' = species
#                 , 'xgboost_param' = xgboost_param)
#  , export = list('tagRRFxgboost' = tagRRFxgboost)
#  , n_jobs= length(fs_param)
#  , pkgs=c('caret', 'xgboost', 'dplyr', 'RRF')
#  , log_worker=FALSE
#  , template = tmpl
# )

In [18]:
# qsave(tuning_res, '../tmp/All_ta-gRRF_xgboost.qs')

In [19]:
# Load the tuning results from disk (the file written by the commented-out
# qsave() call above) instead of recomputing them.
tuning_res <- qread('../tmp/All_ta-gRRF_xgboost.qs')

# Results

In [20]:
# Gather the XGBoost tuning results of one feature-selection setting.
#
# res: list (presumably one element per training partition — confirm); each
#      element must provide
#        $tuned_xgb  a named list of performance vectors, whose names have the
#                    form '<nrounds>nrounds...maxdepth<max_depth>';
#        $confirmed  the selected features (only its length is used).
#
# Returns a data.frame with one row per element of $tuned_xgb, across all
# elements of res, holding the performance values plus the columns max_depth,
# nrounds (both parsed from the names) and nconfirmed. Returns NULL when res
# is empty (the previous 1:length(res) loop failed in that case).
getCV <- function(res){
    eachIx <- lapply(res, function(fold){
        model_names <- names(fold$tuned_xgb)
        perf <- as.data.frame(do.call(rbind, fold$tuned_xgb))
        # hyper-parameters are encoded in the element names
        perf$max_depth <- as.numeric(str_extract(model_names, pattern = '(?<=maxdepth).*$'))
        perf$nrounds <- as.numeric(str_extract(model_names, pattern = '^.*(?=nrounds)'))
        perf$nconfirmed <- length(fold$confirmed)
        perf
    })

    # unname() so that names of res do not leak into the row names
    do.call(rbind, unname(eachIx))
}

In [23]:
# Collect the CV results of every feature-selection setting and label each
# table with the (gamma, k) pair found at the same position in fs_param.
# NOTE(review): assumes tuning_res is ordered like fs_param — confirm.
res_cv <- lapply(tuning_res, getCV)
# seq_along() instead of 1:length(): with an empty result list the loop is
# skipped rather than failing on res_cv[[1]].
for (i in seq_along(res_cv)){
    res_cv[[i]]$gamma <- fs_param[[i]]['gamma']
    res_cv[[i]]$k <- fs_param[[i]]['k']
}
# Stack all settings into a single long table.
res_cv <- do.call(rbind, res_cv)

In [24]:
# Mean and standard deviation of every remaining column for each
# (gamma, k, max_depth, nrounds) combination.
res_m <- summarise_all(group_by(res_cv, gamma, k, max_depth, nrounds), mean)
res_sd <- summarise_all(group_by(res_cv, gamma, k, max_depth, nrounds), sd)

# Put both statistics side by side; columns get the suffix _avg or _sd.
res_summary <- left_join(
    res_m,
    res_sd,
    by = c('gamma', 'k', 'max_depth', 'nrounds'),
    suffix = c('_avg', '_sd')
)

In [33]:
# Show the ten parameter combinations with the highest average Kappa.
res_summary %>%
    select(
        gamma, k, max_depth, nrounds,
        Kappa_avg, Kappa_sd,
        Accuracy_avg, Accuracy_sd,
        nconfirmed_avg, nconfirmed_sd
    ) %>%
    arrange(desc(Kappa_avg)) %>%
    head(10)

gamma,k,max_depth,nrounds,Kappa_avg,Kappa_sd,Accuracy_avg,Accuracy_sd,nconfirmed_avg,nconfirmed_sd
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,0.0,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.1,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.2,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.3,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.4,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.5,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.6,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.7,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.8,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
0,0.9,1,250,0.5747835,0.008837095,0.8116667,0.003034509,883.2,24.88105
