# Preamble

In [1]:
library(tidyverse)
library(stringr)
library(data.table)
library(stringr)
library(dplyr)
library(qs)
library(parallel)
library(clustermq)
library(ggpubr)
library(SpiecEasi)

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘package:pur

In [2]:
# Tell clustermq which scheduler to use and where its job template lives.
options(
    clustermq.scheduler = "sge",
    clustermq.template = "~/.clustermq.tmpl"
)

In [3]:
# Source every R script found in ../scripts/.
# `pattern` is a regular expression, not a glob: '*.R' would also match any
# file whose name merely contains a character followed by "r" (e.g. a .txt or
# .Rmd file), so the extension is anchored explicitly. `ignore.case = TRUE`
# still accepts both ".R" and ".r".
file.sources <- list.files('../scripts/', pattern = '\\.R$', full.names = TRUE, ignore.case = TRUE)
# source() evaluates in the global environment by default, so the sourced
# definitions end up in the workspace exactly as with a top-level loop, and no
# loop variable is left behind (or missing when no script is found).
invisible(lapply(file.sources, source))
rm('file.sources')

# Data

In [4]:
# Names of the metadata columns.
meta <- c(
    "dataset", "Sample", "age", "gender",
    "country", "BMI", "westernized", "number_reads"
)

In [5]:
# Load the serialized inputs from ../tmp/.
# `X` is later passed to wrapComp() as `data_ori`.
rules <- qread("../tmp/ruleExec_full.qs")
X     <- qread("../tmp/X.qs")

In [6]:
# Named list giving, for each taxon, the taxa treated as related to it.
related_taxa <- qread('../tmp/related_taxa_full.qs')

# For entries whose name starts with "s_", keep only the related taxa whose own
# name starts with "s" or "g". The alternation is parenthesised so that the
# `^` anchor applies to both branches: the former pattern '^s|g' parsed as
# "(starts with s) OR (contains a g anywhere)".
is_s <- str_which(names(related_taxa), pattern = '^s_')
related_taxa[is_s] <- lapply(related_taxa[is_s], str_subset, pattern = '^(s|g)')

# The four group entries are each related only to themselves.
related_taxa[c('groupa', 'groupb', 'groupc', 'groupd')] <-
    list('groupa', 'groupb', 'groupc', 'groupd')

In [8]:
# Full paths of the files in the simulation directory whose name matches 'simu'.
fnames <- list.files(
    path = "/ebio/abt3_projects/temp_data/aruaud/MtgSimu50/p005_B10/",
    pattern = "simu",
    full.names = TRUE
)

In [9]:
# Number of simulation files found (one wrapComp() call is submitted per file).
length(fnames)

# Functions

In [11]:
# Count true positives, false positives and false negatives at one threshold.
#
# Args:
#   thr:          threshold on `res$val`; rows with `val >= thr` are kept.
#                 NULL keeps every row of `res`.
#   res:          data frame with a `val` column (score) and a `var` column
#                 (label of the predicted item).
#   related_taxa: named list; each element holds the labels accepted as a hit
#                 for the true item given by the element's name.
#
# Returns:
#   Named vector c(tp, fp, fn). All three are NA when no row passes the
#   threshold. The shape is the same in both cases so that
#   sapply(thresholds, getPR, ...) always simplifies to a matrix.
getPR <- function(thr = NULL, res, related_taxa){

    if (is.null(thr)) {
        kept <- res
    } else {
        # Same semantics as subset(): rows where the comparison is NA are dropped.
        pass <- !is.na(res$val) & res$val >= thr
        kept <- res[pass, , drop = FALSE]
    }

    if (nrow(kept) == 0){
        # Previously this branch also carried a 'tn' entry that the regular
        # return value does not have.
        return(c('tp' = NA_integer_, 'fp' = NA_integer_, 'fn' = NA_integer_))
    }
    nodes <- unique(kept$var)

    # predicted labels that are not accepted for any true item
    fp <- sum(!(nodes %in% unlist(related_taxa)))

    # true items for which at least one accepted label was predicted
    # (vapply keeps this well-defined when `related_taxa` is empty)
    tp <- sum(vapply(related_taxa, function(x){any(x %in% nodes)}, logical(1)))

    # true items whose own name is not among the predicted labels
    fn <- sum(!(names(related_taxa) %in% nodes))

    c('tp' = tp, 'fp' = fp, 'fn' = fn)

}

In [12]:
# Run sparCC on one simulated dataset and score the recovered edges against
# the simulation's true edges.
#
# Args:
#   fname:        path to a simulation file readable by qread(). The seed is
#                 parsed from the digits located between "simu" and the next
#                 "_" in the path. The object read must provide `true_edges`
#                 and `target`.
#   data_ori:     data frame with a `number_reads` column; every other column
#                 is multiplied by `number_reads` before being given to sparCC.
#   related_taxa: named list, indexed here by the names of the true nodes; each
#                 element lists the labels accepted as a hit for that node.
#   n_proc:       number of BLAS threads to use while sparCC runs. NA leaves
#                 the BLAS thread settings untouched.
#
# Returns:
#   A list with `seedOri`, `expanded_edges`, `tp_nodes`, `tp_edges`, `n_nodes`,
#   `n_edges`, `sparcc_crossed` (per-pair sparCC summary) and `edges`
#   (tp/fp/fn counts from getPR() over all thresholds).
#
# Requires getPR() to be available where this function runs.
wrapComp <- function(fname, data_ori, related_taxa, n_proc = 5){

    res <- list()
    # Whatever has been stored in `res` so far is what the function hands back.
    # NOTE(review): presumably meant to return partial results when a later
    # step fails -- confirm that this is the intended behaviour.
    on.exit(return(res))

    # get data
    message('Data preparation...')
    seedOri <- as.numeric(str_extract(fname, pattern = '(?<=simu)[:digit:]+(?=\\_)'))
    res$seedOri <- seedOri
    set.seed(seedOri)
    # shuffle the rows with the seed taken from the file name
    data_ori <- data_ori[sample(seq_len(nrow(data_ori))),]
    simu <- qread(fname)

    # ground truth: labels are the edge members with everything from the first
    # double underscore onwards removed
    strip_suffix <- function(x){str_replace(x, pattern = '\\_{2}.*', replacement = '')}
    tn <- unique(strip_suffix(unlist(simu$true_edges)))
    related_taxa <- related_taxa[tn]

    te <- unique(lapply(lapply(simu$true_edges, strip_suffix), sort))

    # For every true edge, all pairings of the labels accepted for its two
    # members, each written as a sorted "a - b" string.
    expanded_edges <- lapply(te, function(pair){
        members <- related_taxa[pair]
        combos <- asplit(expand.grid(members[[1]], members[[2]]), MARGIN = 1)
        vapply(lapply(combos, sort), paste, character(1), collapse = ' - ')
    })
    names(expanded_edges) <- vapply(te, function(pair){paste(sort(pair), collapse = ' - ')}, character(1))

    res$expanded_edges <- expanded_edges
    res$tp_nodes <- length(tn)
    res$tp_edges <- length(te)
    # NOTE(review): `data_ori` still contains the `number_reads` column here,
    # so both counts below include it -- confirm whether that is intended.
    res$n_nodes <- ncol(data_ori)
    res$n_edges <- (ncol(data_ori)^2 -ncol(data_ori))/2

    # spiec-easi: too long, too bad.
    # Multiply every non-`number_reads` column by the per-row `number_reads`
    # (the vector is recycled down each column of the matrix).
    X_counts <- as.matrix(select(data_ori, -c(number_reads))) * data_ori$number_reads

    # Split the rows by target. A logical mask is used instead of -which(...):
    # negative indexing with an empty which() would drop every row when no
    # sample has target '1'.
    is_pos <- simu$target %in% '1'
    X_p <- X_counts[is_pos, , drop = FALSE]
    X_n <- X_counts[!is_pos, , drop = FALSE]

    # small settings and sparCC
    message('sparCC...')
    if (!is.na(n_proc)){RhpcBLASctl::blas_set_num_threads(n_proc)}
    sparcc_p <- sparcc(X_p)
    sparcc_n <- sparcc(X_n)
    # NOTE(review): the thread count is reset to a hard-coded 80, not to the
    # value that was active before the call.
    if (!is.na(n_proc)){RhpcBLASctl::blas_set_num_threads(80)}

    # get results
    message('Get results...')
    colN <- colnames(X_p)
    nc <- ncol(X_p)
    # (row, col) positions of the strict lower triangle, in column-major order,
    # i.e. the same pair order as looping over columns and taking the rows below.
    pair_idx <- which(lower.tri(matrix(nrow = nc, ncol = nc)), arr.ind = TRUE)
    # Kept local: only wrapComp and getPR are shipped to the workers.
    flatten_pairs <- function(fit){
        data.frame('x' = colN[pair_idx[, 'col']]
                   , 'y' = colN[pair_idx[, 'row']]
                   , 'corr' = fit$Cor[pair_idx]
                   , 'covar' = fit$Cov[pair_idx]
                   , stringsAsFactors = FALSE)
    }
    res_p <- flatten_pairs(sparcc_p)
    res_n <- flatten_pairs(sparcc_n)

    # cross: drop the pairs that show the same correlation sign in both subsets
    message('Cross...')
    in_n <- ifelse(res_n$corr >= 0, '+', '-')
    in_n <- paste0(res_n$x, in_n, res_n$y)
    in_p <- ifelse(res_p$corr >= 0, '+', '-')
    in_p <- paste0(res_p$x, in_p, res_p$y)
    res_n <- res_n[which(!(in_n %in% in_p)),]
    res_p <- res_p[which(!(in_p %in% in_n)),]

    res_cross <- rbind(res_p, res_n)
    res_cross$val <- res_cross$corr^2
    res_cross <- summarise_all(group_by(res_cross, x,y), mean)
    res$sparcc_crossed <- res_cross

    # PR curves
    # edges
    message('PR curves...')
    # Label each pair as a sorted "a - b" string, i.e. with the same convention
    # as `expanded_edges`, so that predicted and true edges are comparable
    # whatever the column order of the input.
    edge_x <- res_cross$x
    edge_y <- res_cross$y
    res_cross$var <- vapply(seq_along(edge_x), function(i){
        paste(sort(c(edge_x[i], edge_y[i])), collapse = ' - ')
    }, character(1))
    thr <- sort(unique(res_cross$val))

    pr_edges <- as.data.frame(t(sapply(thr, getPR, res=res_cross, related_taxa=expanded_edges)))
    pr_edges <- arrange(pr_edges, tp, fp) %>% group_by(fp) %>% summarise_all(max)

    res$edges <- pr_edges
    return(res)

}

# go

In [13]:
# Values filled into the clustermq job template for every worker.
tmpl <- list(
    conda = "r-ml",
    cores = 10,
    job_time = "00:59:00",
    job_mem = "10G"
)

In [14]:
# Submit one wrapComp() call per simulation file to the cluster.
# `const` arguments are identical for every call; getPR is exported because
# wrapComp calls it on the workers. n_proc = NA makes wrapComp skip the BLAS
# thread settings.
res <- Q(
    wrapComp,
    fname = fnames,
    const = list(data_ori = X, related_taxa = related_taxa, n_proc = NA),
    export = c(getPR = getPR),
    pkgs = c("SpiecEasi", "qs", "stringr", "tidyverse"),
    n_jobs = 17,
    template = tmpl,
    log_worker = FALSE
)

Submitting 17 worker jobs (ID: cmq7709) ...

Running 17 calculations (4 objs/8.7 Mb common; 1 calls/chunk) ...


[---------------------------------------------------]   0% (1/17 wrk) eta:  ?s

[---------------------------------------------------]   0% (2/17 wrk) eta:  ?s

[---------------------------------------------------]   0% (3/17 wrk) eta:  ?s

[---------------------------------------------------]   0% (4/17 wrk) eta:  ?s

[---------------------------------------------------]   0% (5/17 wrk) eta:  ?s

[---------------------------------------------------]   0% (6/17 wrk) eta:  ?s

[---------------------------------------------------]   0% (7/17 wrk) eta:  ?s

[---------------------------------------------------]   0% (8/17 wrk) eta:  ?s

[---------------------------------------------------]   0% (9/17 wrk) eta:  ?s

[--------------------------------------------------]   0% (10/17 wrk) eta:  ?s

[--------------------------------------------------]   0% (11/17 wrk) eta:  ?s

[------

In [14]:
# Persist the per-file results returned by the cluster run.
qsave(res, file = "../tmp/comparison_sparcc.qs")