# Preamble

In [1]:
library(tidyverse)
library(stringr)
library(caret)
library(dplyr)
library('qs')
library(data.table)

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift


qs v0.23.4.


Attaching package: ‘da

In [2]:
# Load clustermq and point it at the SGE scheduler, using the job template
# stored in ~/.clustermq.tmpl.
library(clustermq)
options(clustermq.scheduler = "sge", clustermq.template = "~/.clustermq.tmpl")

In [3]:
# Record the R version, platform and attached package versions for reproducibility.
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/envs/r-ml/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] clustermq_0.8.95.1 data.table_1.13.4  qs_0.23.4          caret_6.0-86      
 [5] lattice_0.20-41    forcats_0.5.0      stringr_1.4.0      dplyr_1.0.2       
 [9] purrr_0.3.4        readr_1.4.0        tidyr_1.1.2        tibble_3.0.4      
[13] ggplot2_3.3.2      tidy

# Functions

In [4]:
# Source every R script found in ../scripts/ so that its definitions are
# available in the cells below.
# `pattern` is a regular expression, not a shell glob: '\\.R$' only keeps file
# names that end in ".R" (or ".r", because ignore.case = TRUE). The previous
# glob-style '*.R' was not anchored, so it could also pick up names such as
# "x.Rmd" or "x.RData".
file.sources <- list.files('../scripts/', pattern = '\\.R$', full.names = TRUE, ignore.case = TRUE)
for (f in file.sources) {
    source(f)
}
# Drop the loop helpers so they do not linger in the workspace.
rm('file.sources', 'f')

In [5]:
# Source the shared taxa-list helpers.
# NOTE(review): presumably this is where getFamilies/getGenera/getSpecies
# (used further down) are defined - confirm.
source('../../Common_scripts/get_taxa_lists.R')

In [6]:
# Run tagRRFRanger_wcase once per resample for a single (gamma, k) setting.
#
# trainIx    list of training index sets; one call is made per element.
# data, meta, target, families, genera, species
#            forwarded unchanged to tagRRFRanger_wcase.
# params     vector (or list) with elements named 'gamma' and 'k'.
# num.trees  forwarded to tagRRFRanger_wcase (default 500).
#
# Returns a list with one result per element of trainIx; every element carries
# the same name, "<gamma>_<k>".
gammaTuning <- function(trainIx, data, meta, target, families, genera, species, params, num.trees=500){
    fits <- lapply(
        trainIx, tagRRFRanger_wcase,
        data = data, meta = meta, target = target,
        families = families, genera = genera, species = species,
        gamma = params[['gamma']], k = params[['k']], num.trees = num.trees
    )
    # One shared label identifying the parameter setting of all fits.
    label <- paste0(params['gamma'], '_', params['k'])
    setNames(fits, rep(label, length(fits)))
}

# Data

## data

In [7]:
# Load the four input tables from qs files. They are joined on their shared
# 'Sample' column in a later cell.
meta <- qread('../data/meta_pp.qs')
tax <- qread('../data/taxa_nometh.qs') 
pth <- qread('../data/pathways_nometh.qs')
# Only the Sample and Mtbc columns of `targets` are used further down.
targets <- qread('../data/targets_meth.qs')

In [8]:
# Print the number of rows in `meta` and the number of rows without any
# missing value.
cat('Number of samples = ', nrow(meta))
# NOTE(review): this `pp` is only used for the count printed here. The next
# cell rebuilds `pp` from the full `meta` table, so the complete-case filter
# does not carry over - confirm that this is intended.
pp <- meta[ complete.cases(meta), ]
cat('.. only complete cases for age, BMI, etc = ', nrow(pp))

Number of samples =  2203.. only complete cases for age, BMI, etc =  748

In [9]:
# Assemble the modelling table: taxa and pathway tables are joined onto the
# metadata by 'Sample', then the Mtbc column of `targets` is attached.
pp <- meta %>%
    left_join(tax, by = 'Sample') %>%
    left_join(pth, by = 'Sample') %>%
    left_join(select(targets, Sample, Mtbc), by = 'Sample')
# Response as a factor; the identifier and the response are then removed from
# the predictor table.
target <- as.factor(pp[['Mtbc']])
pp <- select(pp, -Sample, -Mtbc)

In [10]:
# Number of rows and columns of the predictor table.
dim(pp)

In [11]:
# Fix the RNG seed so the resampling below is reproducible.
set.seed(0)
# Draw 10 training partitions with caret, each holding 70% of the samples and
# returned as a list of index vectors (list = TRUE).
trainIx <- createDataPartition(y = target, times = 10, p = .7, list = TRUE)

In [12]:
# Names of the covariate columns: every column of `pp` whose name contains
# 'dataset', plus 'number_reads'. Passed as `meta` in the (commented-out) Q()
# call further down.
meta_names <- c(str_subset(colnames(pp), pattern = 'dataset'), 'number_reads')

## get taxa lists for FS

In [13]:
# Load the taxonomy table and pass every column through
# endoR::compatibleNames.
# mutate(across(everything(), ...)) replaces the superseded mutate_all().
tax_names <- qread('../data/taxa_table.qs')
tax_names <- mutate(tax_names, across(everything(), endoR::compatibleNames))

“replacing previous import ‘data.table::last’ by ‘dplyr::last’ when loading ‘endoR’”
“replacing previous import ‘data.table::first’ by ‘dplyr::first’ when loading ‘endoR’”
“replacing previous import ‘data.table::between’ by ‘dplyr::between’ when loading ‘endoR’”
“replacing previous import ‘dplyr::union’ by ‘igraph::union’ when loading ‘endoR’”
“replacing previous import ‘dplyr::as_data_frame’ by ‘igraph::as_data_frame’ when loading ‘endoR’”
“replacing previous import ‘dplyr::groups’ by ‘igraph::groups’ when loading ‘endoR’”


In [14]:
# Prefix the family, genus and species labels with their rank letter.
tax_names <- tax_names %>%
    mutate(
        f = paste0('f_', f),
        g = paste0('g_', g),
        s = paste0('s_', s)
    )

In [15]:
# One list per rank, with one entry per distinct label. Each entry is the
# result of the matching helper called on that label, and entries are named by
# the label itself. setNames(nm = x) names the vector by its own values, and
# lapply() carries those names over to the result.
families <- lapply(setNames(nm = unique(tax_names$f)), getFamilies, tax_names = tax_names)
genera <- lapply(setNames(nm = unique(tax_names$g)), getGenera, tax_names = tax_names)
species <- lapply(setNames(nm = unique(tax_names$s)), getSpecies, tax_names = tax_names)

# CV

In [16]:
# Candidate values for the two tuning parameters: 0, 0.1, ..., 1 for each.
gammas <- seq(from = 0, to = 1, by = 0.1)
ks <- seq(from = 0, to = 1, by = 0.1)

In [17]:
# Build the full gamma x k grid as a list with one named numeric vector
# c(gamma = ., k = .) per combination; gamma varies fastest.
params <- expand.grid(gammas, ks) %>%
    t() %>%
    data.frame() %>%
    as.list() %>%
    lapply(setNames, nm = c('gamma', 'k'))
# Label every combination "<gamma>_<k>".
names(params) <- vapply(params, paste, character(1), collapse = '_')

In [18]:
# Per-job settings handed to clustermq as `template` in the commented-out Q()
# call below.
# NOTE(review): presumably these fill the placeholders of ~/.clustermq.tmpl
# (conda environment, cores, wall time, memory) - confirm against that file.
tmpl <- list(conda = "r-ml", cores = 10, job_time = '2:00:00', job_mem = '5G')

In [19]:
# res <- Q(gammaTuning
#  , params = params
#  , const = list('data'= pp, 'meta' = meta_names, 'target' = target, 'trainIx' = trainIx
#                 , 'families' = families,'genera' = genera, 'species' = species)
#  , export = list('tagRRFRanger_wcase' = tagRRFRanger_wcase)
#  , n_jobs= length(params)
#  , pkgs=c('caret', 'ranger', 'dplyr', 'RRF')
#  , log_worker=FALSE
#  , template = tmpl
# )

In [20]:
#qsave(res, '../tmp/Sub_ta-gRRF_range.qs')

In [22]:
# Load the saved tuning results; the Q() call and the qsave() that wrote this
# file are commented out in the cells above.
res <- qread('../tmp/Sub_ta-gRRF_range.qs')

# check

In [23]:
# Build one data frame per parameter setting: the rf_performance of every
# resample stacked row-wise, plus the number of confirmed features and the
# gamma / k values of that setting.
# res[[i]] is matched to params[[i]] by position.
# seq_along() keeps the loop from running when `res` is empty (1:length(res)
# would iterate over c(1, 0)), and the result list is preallocated.
all <- vector('list', length(res))
for (i in seq_along(res)) {
    all[[i]] <- as.data.frame(do.call(rbind, lapply(res[[i]], function(x) x$rf_performance)))
    # vapply() guarantees an integer vector whatever the input looks like.
    all[[i]]$nconf <- vapply(res[[i]], function(x) length(x$confirmed), integer(1))
    # `[[` extracts the bare value, so no stray names end up on the columns.
    all[[i]]$gamma <- params[[i]][['gamma']]
    all[[i]]$k <- params[[i]][['k']]
}

In [24]:
# Stack the per-setting data frames into a single data frame.
all <- do.call(rbind, all)

In [25]:
# Mean of every metric per (gamma, k) setting; show the three settings with
# the highest mean Kappa.
# across() replaces the superseded summarise_all(), and .groups = 'drop'
# returns an ungrouped table so later verbs are not affected by leftover
# grouping.
all %>%
    group_by(gamma, k) %>%
    summarise(across(everything(), mean), .groups = 'drop') %>%
    arrange(desc(Kappa)) %>%
    head(3) %>%
    select(Accuracy, Kappa, k, gamma, nconf)

Accuracy,Kappa,k,gamma,nconf
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.8013393,0.551446,0.8,0.2,99.7
0.7982143,0.5422721,0.8,0.1,156.9
0.7973214,0.5394769,0.1,0.1,121.1


In [26]:
# Standard deviation of every metric across resamples for the setting
# k = 0.8, gamma = 0.2.
# near() compares with a tolerance, so the lookup does not rely on the grid
# values generated by seq() being bit-identical to the literals 0.8 and 0.2.
# At most one setting matches, so no head() is needed.
all %>%
    group_by(gamma, k) %>%
    summarise(across(everything(), sd), .groups = 'drop') %>%
    filter(near(k, 0.8), near(gamma, 0.2)) %>%
    select(Accuracy, Kappa, k, gamma, nconf)

Accuracy,Kappa,k,gamma,nconf
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.02879762,0.06425079,0.8,0.2,4.595892


# BMI and age selected?

In [27]:
# Number of covariate names in `meta_names`; the next cell drops this many
# leading entries from each confirmed-feature vector.
length(meta_names)

In [28]:
# For every (gamma, k) setting, average over the resamples how many times
# 'BMI', 'age' and 'genderfemale' occur among the confirmed features, once the
# first length(meta_names) entries of `confirmed` have been dropped.
# NOTE(review): this presumes the leading entries of `confirmed` are the
# covariates listed in `meta_names` - confirm against tagRRFRanger_wcase.

# Count, for each run in `runs`, the occurrences of `feature` in
# `confirmed` after skipping its first `n_skip` entries.
count_confirmed <- function(runs, feature, n_skip = length(meta_names)) {
    vapply(runs, function(x) {
        # A logical mask stays correct when n_skip is 0, whereas -c(1:0)
        # would silently drop the first element.
        beyond_meta <- seq_along(x$confirmed) > n_skip
        sum(x$confirmed[beyond_meta] == feature)
    }, integer(1))
}

meta_c <- vector('list', length(res))
for (i in seq_along(res)) {
    bmi_c <- count_confirmed(res[[i]], 'BMI')
    age_c <- count_confirmed(res[[i]], 'age')
    gen_c <- count_confirmed(res[[i]], 'genderfemale')
    # `[[` extracts the bare values, so the entries are labelled 'k' and
    # 'gamma' instead of 'k.k' and 'gamma.gamma'.
    meta_c[[i]] <- c('bmi' = mean(bmi_c), 'age' = mean(age_c), 'gender' = mean(gen_c)
                     , 'k' = params[[i]][['k']], 'gamma' = params[[i]][['gamma']])
}

In [29]:
# Stack the per-setting vectors row-wise into one data frame.
meta_c <- as.data.frame(do.call(rbind, meta_c))
# Make sure the 4th and 5th columns are called 'k' and 'gamma'. Combining a
# named length-1 vector inside c('k' = ...) yields composite names such as
# 'k.k'.
colnames(meta_c)[4:5] <- c('k', 'gamma')

In [30]:
# Covariate counts for the setting k = 0.8, gamma = 0.2, followed by a summary
# over all settings.
# near() compares with a tolerance, so the lookup does not rely on the grid
# values generated by seq() being bit-identical to the literals 0.8 and 0.2.
meta_c %>% subset(near(k, 0.8) & near(gamma, 0.2))
summary(meta_c)

Unnamed: 0_level_0,bmi,age,gender,k,gamma
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
91,0,0,0,0.8,0.2


      bmi         age        gender        k           gamma    
 Min.   :0   Min.   :0   Min.   :0   Min.   :0.0   Min.   :0.0  
 1st Qu.:0   1st Qu.:0   1st Qu.:0   1st Qu.:0.2   1st Qu.:0.2  
 Median :0   Median :0   Median :0   Median :0.5   Median :0.5  
 Mean   :0   Mean   :0   Mean   :0   Mean   :0.5   Mean   :0.5  
 3rd Qu.:0   3rd Qu.:0   3rd Qu.:0   3rd Qu.:0.8   3rd Qu.:0.8  
 Max.   :0   Max.   :0   Max.   :0   Max.   :1.0   Max.   :1.0  