From 4afec7e80cdc6ec3637874ec954e9906aa3767e8 Mon Sep 17 00:00:00 2001 From: MartinFXP Date: Thu, 16 Aug 2018 15:39:26 +0200 Subject: [PATCH] extended help file descriptions and vignette minor bugfixing for k=1 --- DESCRIPTION | 8 +- NAMESPACE | 1 + R/mnems.r | 560 +++++++++++++++++++++++++++------------------ R/mnems_low.r | 56 +++-- man/bootstrap.Rd | 18 +- man/clustNEM.Rd | 6 +- man/getAffinity.Rd | 15 +- man/getIC.Rd | 32 ++- man/hamSim.Rd | 14 +- man/mnem.Rd | 72 +++--- man/plot.mnem.Rd | 43 ++-- man/plotDnf.Rd | 26 ++- man/sim.Rd | 18 +- man/simData.Rd | 19 +- vignettes/mnem.Rmd | 369 ++++++++++++++++++++++------- 15 files changed, 808 insertions(+), 449 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5a09548..da6d017 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,15 +1,15 @@ Package: mnem Type: Package -Title: mnem -Version: 0.99.5 +Title: Mixture Nested Effects Models +Version: 0.99.6 Author: Martin Pirkl Maintainer: Martin Pirkl -Description: Given a single cell populations from different perturbation experiments mnem infers sub populations with different states for a signaling pathway. +Description: Mixture Nested Effects Models (mnem) is an extension of Nested Effects Models and allows for the analysis of single cell perturbation data provided by methods like Perturb-Seq (Dixit et al., 2016) or Crop-Seq (Datlinger et al., 2017). In those experiments each of many cells is perturbed by a knock-down of a specific gene, i.e. several cells are perturbed by a knock-down of gene A, several by a knock-down of gene B, ... and so forth. The observed read-out has to be multi-trait and in the case of the Perturb-/Crop-Seq gene are expression profiles for each cell. mnem uses a mixture model to simultaneously cluster the cell population into k clusters and and infer k networks causally linking the perturbed genes for each cluster. The mixture components are inferred via an expectation maximization algorithm. Depends: R (>= 3.5) License: GPL-3 Encoding: UTF-8 LazyData: true -biocViews: Pathways, SystemsBiology, NetworkInference, Network, RNASeq, PooledScreens, SingleCell, CRISPR, ATACSeq, DNASeq +biocViews: Pathways, SystemsBiology, NetworkInference, Network, RNASeq, PooledScreens, SingleCell, CRISPR, ATACSeq, DNASeq, GeneExpression RoxygenNote: 6.0.1 Imports: cluster, nem, epiNEM, graph, Rgraphviz, flexclust, lattice, naturalsort, snowfall, stats4, tsne, methods, graphics, stats, utils VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index ff45244..7847464 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(plot,bootmnem) S3method(plot,mnem) export(bootstrap) export(clustNEM) diff --git a/R/mnems.r b/R/mnems.r index 5cfe3c5..d095088 100644 --- a/R/mnems.r +++ b/R/mnems.r @@ -1,12 +1,7 @@ #' Simulation results #' Example data: simulation results -#' Contains simulation results. How they were -#' aquired is explained in the vignette. -#' The data conists of a list of data matrices holding -#' sensitivity and specificity -#' (spec, sens) of network edges for the variious methods compared to -#' the ground truth, sensitivity and specificity (sens2, spec2) -#' of the expected data for clustNEM and mnem. +#' Contains simulation results. +#' For details see the vignette. #' @name sim #' @docType data #' @usage sim @@ -33,13 +28,17 @@ NA #' @examples #' data(app) NA -#' Calculate responsibilities +#' Calculate responsibilities. +#' This function calculates the responsibilities +#' of each component for all cells from the expected log distribution of the +#' hidden data. #' @param x log odds for l cells and k components as a kxl matrix #' @param affinity 0 for standard soft clustering, 1 for hard clustering #' during inference (not recommended) #' @param norm if TRUE normalises to probabilities (recommended) -#' @param logtype logarithm type of the data -#' @param mw miture weights +#' @param logtype logarithm type of the data (e.g. 2 for log2 data or exp(1) +#' for natural) +#' @param mw mixture weights of the components #' @param data data in log odds #' @return responsibilities as a kxl matrix (k components, l cells) #' @author Martin Pirkl @@ -104,16 +103,28 @@ getAffinity <- function(x, affinity = 0, norm = TRUE, logtype = 2, mw = NULL, y[which(is.na(y) == TRUE)] <- 0 return(y) } -#' calculate negative penalized log likelihood +#' Calculate negative penalized log likelihood. +#' This function calculates +#' a negative penalized log likelihood given a object of class mnem. This +#' penalized likelihood is based on the normal likelihood and penalizes +#' complexity of the mixture components (i.e. the networks). #' @param x mnem object -#' @param man logical. manual data penalty -#' @param degree different degree of complexity -#' @param logtype logarithm type of the data -#' @param pen penalty weight for the data +#' @param man logical. manual data penalty, e.g. man=TRUE and pen=2 for an +#' approximation of the Akaike Information Criterion +#' @param degree different degree of penalty for complexity: positive entries +#' of transitively reduced phis or phi^r (degree=0), phi^r and mixture +#' components minus one k-1 (1), phi^r, k-1 and positive entries of thetas (2), +#' positive entries of transitively closed phis or phi^t, k-1 (3), phi^t, theta, +#' k-1 (4, default), all entries of phis, thetas and k-1 (5) +#' @param logtype logarithm type of the data (e.g. 2 for log2 data or exp(1) +#' for natural) +#' @param pen penalty weight for the data (e.g. pen=2 for approximate Akaike +#' Information Criterion) #' @param useF use F (see publication) as complexity instead of phi and theta -#' @param Fnorm normalize complexity of F +#' @param Fnorm normalize complexity of F, i.e. if two components have the +#' same entry in F, it is only counted once #' @author Martin Pirkl -#' @return penlized likelihood +#' @return penalized likelihood #' @export #' @importFrom nem transitive.closure transitive.reduction #' @examples @@ -180,47 +191,55 @@ getIC <- function(x, man = FALSE, degree = 4, logtype = 2, pen = 2, } return(ic) } -#' learn mixture of networks from a single-ceel knock-down experiment +#' Mixture NEMs - main function. +#' This function simultaneously learns a mixture +#' of causal networks and clusters of a cell population from single cell +#' perturbation data (e.g. log odds of fold change) with a mutli-trait +#' readout. E.g. Pooled CRISPR scRNA-Seq data (Perturb-Seq. Dixit et al., 2016, +#' Crop-Seq. Datlinger et al., 2017). #' @param D data with cells indexing the columns and features (E-genes) #' indexing the rows -#' @param inference inference method "em" for expectation maximization, -#' "greedy" or "genetic" +#' @param inference inference method "em" for expectation maximization #' @param search search method for single network inference "greedy", #' "exhaustive" or "modules" #' @param start A list of n lists of k networks for n starts of the EM and #' k components #' @param method "llr" for log ratios or foldchanges as input (see ratio) -#' @param parallel number of threads -#' @param reduce logical - reduce search space for exhaustive search -#' @param runs how many runs for greedy search -#' @param starts how many starts for em +#' @param parallel number of threads for parallelization of the number of +#' em runs +#' @param reduce logical - reduce search space for exhaustive search to +#' unique networks +#' @param runs number of runs for greedy search +#' @param starts number of starts for the em #' @param type initialize with "random" probabilities or "cluster" the data -#' @param p initial probabilities +#' @param p initial probabilities as a k (components) times l (cells) matrix #' @param k number of components #' @param kmax maximum number of components when k=NULL is inferred -#' @param verbose output -#' @param max_iter maximum iteration if likelihood does not converge -#' @param parallel2 if parallel=NULL, number of threads for component +#' @param verbose verbose output +#' @param max_iter maximum iteration, if likelihood does not converge +#' @param parallel2 if parallel=NULL, number of threads for single component #' optimization -#' @param converged absolute distance for convergence -#' @param redSpace space for exhaustive search -#' @param affinity 0 is default, 1 is for hard clustering -#' @param evolution logical, penalty on difference of components -#' @param subtopoX hard prior on theta +#' @param converged absolute distance for convergence between new and old log +#' likelihood +#' @param redSpace space for "exhaustive" search +#' @param affinity 0 is default for soft clustering, 1 is for hard clustering +#' @param evolution logical. If TRUE components are penelized for being +#' different from each other. +#' @param subtopoX hard prior on theta as a vector of S-genes for all E-genes #' @param ratio logical, if true data is log ratios, if false foldchanges -#' @param logtype logarithm of the data -#' @param initnets initial networks in a list -#' @param popSize population size -#' @param stallMax maximum number of stall generations till convergence -#' @param elitism number of networks to keep during evolution -#' @param maxGens maximum numbero f generations -#' @param domean average the data (speed imporvment) +#' @param logtype logarithm type of the data (e.g. 2 for log2 data or exp(1) +#' for natural) +#' @param initnets initial networks as a list of phis +#' @param domean average the data, when calculating a single NEM (speed +#' improvment) #' @param modulesize max number of S-genes per module in module search -#' @param compress compress networks after search +#' @param compress compress networks after search (warning: penelized +#' likelihood not interpretable) #' @param increase if set to FALSE, the algorithm will not stop if the #' likelihood decreases #' @author Martin Pirkl -#' @return optimized network for data fit +#' @return object of class mnem with the log expected of the hidden data +#' and phi and theta for all components k #' @export #' @import #' epiNEM @@ -248,7 +267,6 @@ mnem <- function(D, inference = "em", search = "modules", start = NULL, max_iter = 100, parallel2 = NULL, converged = 10^-1, redSpace = NULL, affinity = 0, evolution = FALSE, subtopoX = NULL, ratio = TRUE, logtype = 2, initnets = FALSE, - popSize = 10, stallMax = 2, elitism = NULL, maxGens = Inf, domean = TRUE, modulesize = 5, compress = FALSE, increase = TRUE) { if (reduce & search %in% "exhaustive" & is.null(redSpace)) { @@ -378,9 +396,9 @@ mnem <- function(D, inference = "em", search = "modules", start = NULL, ratio, mw = mw) if (getLL(probs$probs, logtype = logtype, mw = mw, data = data) > getLL(probs0$probs, - logtype = logtype, - mw = mw, - data = data)) { + logtype = logtype, + mw = mw, + data = data)) { probs0 <- probs } } @@ -426,135 +444,161 @@ mnem <- function(D, inference = "em", search = "modules", start = NULL, while ((((ll - llold > converged & increase) | (abs(ll - llold) > converged & !increase) & count < max_iter)) | time0) { - if (!time0) { - if (ll - bestll > 0) { - bestll <- ll; bestres <- res1 - bestprobs <- probs; bestmw <- mw - } - llold <- ll - probsold <- probs - } - time0 <- FALSE - res <- list() - postprobs <- getAffinity(probs, affinity = affinity, - norm = TRUE, logtype = logtype, - mw = mw, data = data) - edgechange <- 0 - thetachange <- 0 - for (i in seq_len(k)) { - if (!is.null(res1) & !("modules" %in% search)) { - start0 <- res1[[i]]$adj - } else { - start0 <- NULL - } - n <- getSgeneN(data) - dataF <- matrix(0, nrow(data), n) - colnames(dataF) <- seq_len(n) - nozero <- which(postprobs[i, ] != 0) - if (length(nozero) != 0) { - if (length(nozero) == ncol(data)) { - dataR <- data - postprobsR <- postprobs[i, ] - } else { - dataR <- cbind(data[, nozero, drop = FALSE], - dataF) - postprobsR <- c(postprobs[i, nozero], rep(0, n)) - } - } else { - dataR <- dataF - postprobsR <- rep(0, n) - } - if (is.null(start0)) { - res[[i]] <- mynem(D = dataR, weights = postprobsR, - search = search, start = start0, - method = method, - parallel = parallel2, - reduce = reduce, - runs = runs, verbose = verbose, - redSpace = redSpace, - ratio = ratio, domean = domean, - modulesize = modulesize) - } else { - test01 <- list() - test01scores <- numeric(3) - for (j in seq_len(3)) { - if (j == 3) { - start1 <- start0 - } else { - start1 <- start0*0 + (j - 1) - } - test01[[j]] <- mynem(D = dataR, - weights = postprobsR, - search = search, start = start1, - method = method, - parallel = parallel2, - reduce = reduce, - runs = runs, verbose = verbose, - redSpace = redSpace, - ratio = ratio, domean = domean, - odulesize = modulesize) - test01scores[j] <- max(test01[[j]]$scores) - } - res[[i]] <- test01[[which.max(test01scores)]] - } - edgechange <- edgechange + - sum(abs(res[[i]]$adj - res1[[i]]$adj)) - thetachange <- thetachange + - sum(res[[i]]$subtopo != res1[[i]]$subtopo) - res[[i]]$D <- NULL - res[[i]]$subweights <- NULL - - } - evopen <- 0 - if (evolution) { - res <- sortAdj(res)$res - evopen <- calcEvopen(res) - } - res1 <- res - # E-step: - n <- ncol(res[[1]]$adj) - probs0 <- list() - probs0$probs <- matrix(0, k, ncol(data)) - for (i in seq_len(3)) { - if (i == 3) { - probs <- probsold - } else { - probs <- matrix(i - 1, k, ncol(data)) - } - probs <- getProbs(probs, k, data, res, method, n, - affinity, converged, subtopoX, ratio, - mw = mw) - if (getLL(probs$probs, logtype = logtype, mw = mw, - data = data) > getLL(probs0$probs, - logtype = logtype, - mw = mw, data = data)) { - probs0 <- probs - } - } - subtopoX <- probs0$subtopoX - probs <- probs0$probs - modelsize <- n*n*k - datasize <- nrow(data)*ncol(data)*k - ll <- getLL(probs, logtype = logtype, mw = mw,data = data) + - evopen*datasize*(modelsize^-1) - mw <- apply(getAffinity(probs, affinity = affinity, - norm = TRUE, logtype = logtype, - mw = mw, data = data), 1, sum) - mw <- mw/sum(mw) - if(verbose) { - print(paste("ll: ", ll, sep = "")) - print(paste("changes in phi(s): ", - edgechange, sep = "")) - print(paste("changes in theta(s): ", - thetachange, sep = "")) - if (evolution) { - print(paste("evolution penalty: ", - evopen, sep = "")) - } - } - lls <- c(lls, ll) - count <- count + 1 - } + if (!time0) { + if (ll - bestll > 0) { + bestll <- ll; bestres <- res1 + bestprobs <- probs; bestmw <- mw + } + llold <- ll + probsold <- probs + } + time0 <- FALSE + res <- list() + postprobs <- getAffinity(probs, affinity = + affinity, + norm = TRUE, logtype = + logtype, + mw = mw, data = data) + edgechange <- 0 + thetachange <- 0 + for (i in seq_len(k)) { + if (!is.null(res1) & + !("modules" %in% search)) { + start0 <- res1[[i]]$adj + } else { + start0 <- NULL + } + n <- getSgeneN(data) + dataF <- matrix(0, nrow(data), n) + colnames(dataF) <- seq_len(n) + nozero <- which(postprobs[i, ] != 0) + if (length(nozero) != 0) { + if (length(nozero) == ncol(data)) { + dataR <- data + postprobsR <- postprobs[i, ] + } else { + dataR <- cbind(data[, nozero, + drop = FALSE], + dataF) + postprobsR <- c(postprobs[i, nozero], + rep(0, n)) + } + } else { + dataR <- dataF + postprobsR <- rep(0, n) + } + if (is.null(start0)) { + res[[i]] <- mynem(D = dataR, + weights = postprobsR, + search = search, + start = start0, + method = method, + parallel = parallel2, + reduce = reduce, + runs = runs, + verbose = verbose, + redSpace = redSpace, + ratio = ratio, + domean = domean, + modulesize = modulesize) + } else { + test01 <- list() + test01scores <- numeric(3) + for (j in seq_len(3)) { + if (j == 3) { + start1 <- start0 + } else { + start1 <- start0*0 + (j - 1) + } + test01[[j]] <- mynem(D = dataR, + weights = + postprobsR, + search = search, + start = start1, + method = method, + parallel = + parallel2, + reduce = reduce, + runs = runs, + verbose = verbose, + redSpace = + redSpace, + ratio = ratio, + domean = domean, + odulesize = + modulesize) + test01scores[j] <- + max(test01[[j]]$scores) + } + res[[i]] <- + test01[[which.max(test01scores)]] + } + edgechange <- edgechange + + sum(abs(res[[i]]$adj - res1[[i]]$adj)) + thetachange <- thetachange + + sum(res[[i]]$subtopo != res1[[i]]$subtopo) + res[[i]]$D <- NULL + res[[i]]$subweights <- NULL + + } + evopen <- 0 + if (evolution) { + res <- sortAdj(res)$res + evopen <- calcEvopen(res) + } + res1 <- res + # E-step: + n <- ncol(res[[1]]$adj) + probs0 <- list() + probs0$probs <- matrix(0, k, ncol(data)) + for (i in seq_len(3)) { + if (i == 3) { + probs <- probsold + } else { + probs <- matrix(i - 1, k, ncol(data)) + } + probs <- getProbs(probs, k, data, res, method, + n, + affinity, converged, + subtopoX, ratio, + mw = mw) + if (getLL(probs$probs, logtype = logtype, + mw = mw, + data = data) > getLL(probs0$probs, + logtype = + logtype, + mw = mw, + data = data)) { + probs0 <- probs + } + } + subtopoX <- probs0$subtopoX + probs <- probs0$probs + modelsize <- n*n*k + datasize <- nrow(data)*ncol(data)*k + ll <- getLL(probs, logtype = logtype, mw = mw, + data = data) + + evopen*datasize*(modelsize^-1) + mw <- apply(getAffinity(probs, affinity = affinity, + norm = TRUE, + logtype = logtype, + mw = mw, + data = data), 1, sum) + mw <- mw/sum(mw) + if(verbose) { + print(paste("ll: ", ll, sep = "")) + print(paste("changes in phi(s): ", + edgechange, sep = "")) + print(paste("changes in theta(s): ", + thetachange, sep = "")) + if (evolution) { + print(paste("evolution penalty: ", + evopen, sep = "")) + } + } + lls <- c(lls, ll) + count <- count + 1 + } if (ll - bestll > 0) { bestll <- ll; bestres <- res1 bestprobs <- probs; bestmw <- mw @@ -664,6 +708,7 @@ mnem <- function(D, inference = "em", search = "modules", start = NULL, lambda <- apply(postprobs, 1, sum) lambda0 <- lambda/sum(lambda) lambda <- best$mw + if (k == 1) { lambda <- 1 } if (verbose) { print(all(lambda == lambda0)) } } else { lambda <- 1 @@ -681,13 +726,18 @@ mnem <- function(D, inference = "em", search = "modules", start = NULL, class(res) <- "mnem" return(res) } -#' run bootstrap simulations on the components +#' Bootstrap. +#' Run bootstrap simulations on the components (phi) of an object of +#' class mnem. #' @param x mnem object #' @param size size of the booststrap simulations -#' @param logtype logarithm type of the data +#' @param p percentage of samples (e.g. for 100 E-genes p=0.5 means +#' sampling 50) +#' @param logtype logarithm type of the data (e.g. 2 for log2 data or exp(1) +#' for natural) #' @param ... additional parameters for hte nem function #' @author Martin Pirkl -#' @return returns bootstrap support for each edge +#' @return returns bootstrap support for each edge in each component (phi) #' @export #' @examples #' sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) @@ -695,8 +745,7 @@ mnem <- function(D, inference = "em", search = "modules", start = NULL, #' data <- data + rnorm(length(data), 0, 1) #' result <- mnem(data, k = 2, starts = 2) #' boot <- bootstrap(result, size = 10) -bootstrap <- function(x, size = 1000, logtype = 2, ...) { - ## bootstrap on the components to get frequencies +bootstrap <- function(x, size = 1000, p = 1, logtype = 2, ...) { data <- x$data post <- getAffinity(x$probs, logtype = logtype, mw = x$mw, data = data) bootres <- list() @@ -705,40 +754,89 @@ bootstrap <- function(x, size = 1000, logtype = 2, ...) { bootres[[i]] <- x$comp[[i]]$phi*0 for (j in seq_len(size)) { dataRR <- dataR[sample(seq_len(nrow(dataR)), - ceiling(0.5*nrow(dataR)), replace = TRUE), ] + ceiling(p*nrow(dataR)), replace = TRUE), ] res <- mynem(dataRR, ...) bootres[[i]] <- bootres[[i]]+transitive.closure(res$adj, mat = TRUE) } bootres[[i]] <- bootres[[i]]/size + colnames(bootres[[i]]) <- rownames(bootres[[i]]) <- + naturalsort(unique(colnames(data))) } + class(bootres) <- "bootmnem" return(bootres) } -#' plot mnem result +#' Plot boostrap mnem result. +#' @param x bootmnem object +#' @param reduce if TRUE transitively reduces the graphs +#' @param ... additional parameters for the plotting function plotDNF +#' @author Martin Pirkl +#' @return visualization of bootstrap mnem result with Rgraphviz +#' @export +#' @method plot bootmnem +#' @examples +#' sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) +#' data <- (sim$data - 0.5)/0.5 +#' data <- data + rnorm(length(data), 0, 1) +#' result <- mnem(data, k = 2, starts = 2) +#' boot <- bootstrap(result, size = 10) +#' plot(boot) +plot.bootmnem <- function(x, reduce = TRUE, ...) { + dnfs <- freqs <- NULL + for (i in seq_len(length(x))) { + adj <- adj2 <- x[[i]] + rownames(adj) <- colnames(adj) <- paste0(rownames(adj), "_", i) + adj[which(adj <= 0.5)] <- 0 + if (reduce) { + adj <- transitive.reduction(adj)*x[[i]] + } else { + adj <- x[[i]] + } + adj2 <- adj + adj2[which(adj > 0.5)] <- 0 + bidi <- which(adj2+t(adj2) > 0.5) + adj[which(adj <= 0.5)] <- 0 + adj[bidi] <- (adj2+t(adj2))[bidi] + diag(adj) <- 0 + dnf <- adj2dnf(apply(adj, c(1,2), ceiling)) + dnf <- dnf[-(1:nrow(adj))] + freq <- as.vector(t(adj)) + freq <- freq[which(freq != 0)] + dnfs <- c(dnfs, dnf) + freqs <- c(freqs, freq) + } + plotDnf(dnfs, edgelabel = freqs, ...) +} +#' Plot mnem result. #' @param x mnem object #' @param oma outer margin #' @param main main text -#' @param anno annotate cells -#' @param cexAnno text size -#' @param scale scale cells -#' @param global global clustering -#' @param egenes show egene attachments -#' @param sep seperate clusters -#' @param tsne use tsne -#' @param affinity logical use hard clustering if true -#' @param logtype logarithm type of the data -#' @param cells show ccell attachments +#' @param anno annotate cells by their perturbed gene +#' @param cexAnno text size of the cell annotations +#' @param scale scale cells to show relative and not absolute distances +#' @param global if TRUE clusters all cells, if FALSE clusters cells within +#' a component +#' @param egenes show egene attachments, i.e. number of E-genes +#' assigned to each S-gene +#' @param sep seperate clusters and not put them on top of each other +#' for better visualization +#' @param tsne if TRUE use tsne instead of pca +#' @param affinity use hard clustering if TRUE +#' @param logtype logarithm type of the data (e.g. 2 for log2 data or exp(1) +#' for natural) +#' @param cells show cell attachments, .i.e how many cells are assigned +#' to each S-gene #' @param pch cell symbol #' @param legend show legend -#' @param showdata show data if true -#' @param bestCell show probability of best fitting cell -#' @param showprobs show probabilities -#' @param shownull show null node -#' @param ratio use log ratios (if true) or foldchanges +#' @param showdata show data if TRUE +#' @param bestCell show probability of best fitting cell for each S-gene +#' @param showprobs if TRUE, shows responsibilities for all cells and components +#' @param shownull if TRUE, shows the null node +#' @param ratio use log ratios (TRUE) or foldchanges (FALSE) #' @param method "llr" for ratios -#' @param showweights shwo weights +#' @param showweights if TRUE, shows mixture weights for all components #' @param ... additional parameters #' @author Martin Pirkl -#' @return visualization of mnem result Rgraphviz +#' @return visualization of mnem result with Rgraphviz #' @export #' @method plot mnem #' @import @@ -757,7 +855,7 @@ bootstrap <- function(x, size = 1000, logtype = 2, ...) { #' result <- mnem(data, k = 2, starts = 2) #' plot(result) plot.mnem <- function(x, oma = c(3,1,1,3), main = "M&NEM", anno = TRUE, - # import deleted graphics + # import deleted graphics cexAnno = 1, scale = NULL, global = TRUE, egenes = TRUE, sep = FALSE, tsne = FALSE, affinity = 0, logtype = 2, cells = TRUE, pch = ".", legend = FALSE, showdata = FALSE, @@ -799,7 +897,7 @@ plot.mnem <- function(x, oma = c(3,1,1,3), main = "M&NEM", anno = TRUE, Cells = 0.5, Fit = 0.5), nodewidth = list(Sgenes = 2, Egenes = 0.5, Cells = 0.5, Fit = 0.5), - layout = "circo") + layout = "circo") } full <- x$comp[[1]]$phi mixnorm <- getAffinity(x$probs, affinity = affinity, norm = TRUE, @@ -1074,7 +1172,8 @@ Mixture weight: ", round(x$mw[i], 3)*100, "%", sep = "") } } } -#' clusters the data and performs standard nem on each cluster +#' Cluster NEM. +#' This function clusters the data and performs standard nem on each cluster. #' @param data data of log ratios with cells in columns and features in rows #' @param k number of clusters #' @param ... additional arguments for standard nem function @@ -1144,19 +1243,21 @@ clustNEM <- function(data, k = 2:5, ...) { res$probs <- matrix(res$mw, K, ncol(data)) return(res) } -#' simulate single cell data from a mixture of networks +#' Simulate data. This function simulates single cell data from a random +#' mixture of networks. #' @param Sgenes number of Sgenes #' @param Egenes number of Egenes #' @param subsample range to subsample data. 1 means the full simulated data is #' used. -#' @param Nems numberof components -#' @param reps number of relicates, if set (not realistice for cells) -#' @param mw mixture weights -#' @param evolution evovling network, if set to true +#' @param Nems number of components +#' @param reps number of replicates, if set (not realistic for cells) +#' @param mw mixture weights (has to be vector of length Nems) +#' @param evolution evovling and not purely random network, if set to TRUE #' @param nCells number of cells #' @param uninform number of uninformative Egenes -#' @param unitheta uniform theta, if true -#' @param edgeprob edge probability +#' @param unitheta uniform theta, if TRUE +#' @param edgeprob edge probability, value between 0 and 1 for sparse or +#' dense networks #' @author Martin Pirkl #' @return simulation object with meta information and data #' @export @@ -1256,16 +1357,20 @@ simData <- function(Sgenes = 5, Egenes = 1, subsample = 1, ncol(data)*uninform, replace = TRUE), uninform, ncol(data))) } - return(list(Nem = Nem, theta = theta, data = data, index = index)) + sim <- list(Nem = Nem, theta = theta, data = data, index = index) + class(sim) <- "mnemsim" + return(sim) } -#' normalized hamming accuracy of the columns of two adjacency matrices -#' @param a adjacency matrix -#' @param b adjacency matrix +#' Accuracy for two phis. +#' This function uses the hamming distance to calculate +#' an accuracy for two networks (phi). +#' @param a adjacency matrix (phi) +#' @param b adjacency matrix (phi) #' @param diag if 1 includes diagonal in distance, if 0 not #' @param symmetric comparing a to b is asymmetrical, if TRUE includes #' comparison b to a #' @author Martin Pirkl -#' @return normalized hamming accuracy +#' @return normalized hamming accuracy for a and b #' @export #' @import #' epiNEM @@ -1314,24 +1419,29 @@ hamSim <- function(a, b, diag = 1, symmetric = TRUE) { ham <- min(c(ham2, ham3)) return(ham) } -#' function for visualizing graphs in normal form -#' @param dnf Hyper-graph in disjunctive normal form. -#' @param freq Frequency of hyper-edges. -#' @param stimuli Vertices which can be stimulated. -#' @param signals Vertices which regulate E-genes. -#' @param inhibitors Vertices which can be inhibited. +#' Plot disjunctive normal form. +#' This function visualizes a graph encodedas a disjunctive nromal form. +#' @param dnf Hyper-graph in disjunctive normal form, +#' e.g. c("A=B", "A=C+D", "E=!B") with the child on the left and the parents +#' on the right of the equation with "A=C+D" for A = C AND D. Alternatively, +#' dnf can be an adjacency matrix, which is converted on the fly to a +#' disjunctive normal form. +#' @param freq Frequency of hyper-edges which are placed on the edges. +#' @param stimuli Highlights vertices which can be stimulated. +#' @param signals Highlights vertices which regulate E-genes. +#' @param inhibitors Highlights vertices which can be inhibited. #' @param connected If TRUE, only includes vertices which are connected to other #' vertices. #' @param CNOlist CNOlist object. Optional instead of stimuli, inhibitors or -#' signals. +#' signals. See package CellNOptR. #' @param cex Global font size. #' @param fontsize Vertice label size. #' @param labelsize Edge label size. -#' @param type Different plot types. +#' @param type Different plot types. 2 for Rgraphviz and 1 for graph. #' @param lwd Line width. #' @param edgelwd Edgeline width. #' @param legend 0 shows no legend. 1 shows legend as a graph. 2 shows legend -#' in a box. +#' in a standard box. #' @param x x coordinate of box legend. #' @param y y coordinate of box legend. #' @param xjust Justification of legend box left, right or center (-1,1,0). @@ -2563,7 +2673,7 @@ plotDnf <- function(dnf = NULL, freq = NULL, stimuli = c(), signals = c(), g@renderInfo@nodes$labelX[ which(names(g@renderInfo@nodes$labelX) %in% i)] + 200 - g@renderInfo@nodes$rWidth[[i]] - 10, - length.out = 4)) + length.out = 4)) tmp.splines[2] <- tmp.splines[2] + 50 tmp.splines <- as.integer(tmp.splines) g@renderInfo@edges$splines[[paste(tmp.name, "~", i, @@ -2602,7 +2712,7 @@ plotDnf <- function(dnf = NULL, freq = NULL, stimuli = c(), signals = c(), g@renderInfo@nodes$labelY <- c(g@renderInfo@nodes$labelY, g@renderInfo@nodes$labelY[ - which(names(g@renderInfo@nodes$labelY) %in% i)]) + which(names(g@renderInfo@nodes$labelY) %in% i)]) names(g@renderInfo@nodes$labelX)[ length(g@renderInfo@nodes$labelX)] <- paste(i, "_inhibited", sep = "") diff --git a/R/mnems_low.r b/R/mnems_low.r index e642921..8a42ee9 100644 --- a/R/mnems_low.r +++ b/R/mnems_low.r @@ -762,19 +762,17 @@ mynem <- function(D, search = "greedy", start = NULL, method = "llr", subweights <- Dw%*%cbind(tmp$phi[colnames(Dw), ], 0) subtopo <- apply(subweights, 1, which.max) } - + if (!is.null(parallel)) { sfStop() } - - if (is.null(subtopo)) { - subtopo <- scoreAdj(D, better, method = method, weights = weights, - subtopo = subtopo, prior = prior, - ratio = ratio) - subweights <- subtopo$subweights - subtopo <- subtopo$subtopo - } - + + subtopo <- scoreAdj(D, better, method = method, weights = weights, + subtopo = subtopo, prior = prior, + ratio = ratio) + subweights <- subtopo$subweights + subtopo <- subtopo$subtopo + better <- transitive.reduction(better) better <- better[order(as.numeric(rownames(better))), order(as.numeric(colnames(better)))] @@ -786,24 +784,24 @@ mynem <- function(D, search = "greedy", start = NULL, method = "llr", #' @noRd adj2dnf <- function(A) { - dnf <- NULL - - for (i in seq_len(ncol(A))) { - for (j in seq_len(nrow(A))) { - if (i %in% j) { next() } - if (A[i, j] == 1) { - dnf <- c(dnf, paste(colnames(A)[i], rownames(A)[j], sep = "=")) - } - if (A[i, j] == -1) { - dnf <- c(dnf, paste("!", colnames(A)[i], "=", - rownames(A)[j], sep = "")) - } - } - } + dnf <- NULL + + for (i in seq_len(ncol(A))) { + for (j in seq_len(nrow(A))) { + if (i %in% j) { next() } + if (A[i, j] == 1) { + dnf <- c(dnf, paste(colnames(A)[i], rownames(A)[j], sep = "=")) + } + if (A[i, j] == -1) { + dnf <- c(dnf, paste("!", colnames(A)[i], "=", + rownames(A)[j], sep = "")) + } + } + } - dnf <- unique(dnf) - - return(dnf) + dnf <- unique(dnf) + + return(dnf) } #' @noRd @@ -869,8 +867,8 @@ scoreAdj <- function(D, adj, method = "llr", weights = NULL, #' @noRd adj2dnf <- function(A) { - dnf <- NULL - + dnf <- NULL + for (i in seq_len(ncol(A))) { dnf <- c(dnf, rownames(A)) for (j in seq_len(nrow(A))) { diff --git a/man/bootstrap.Rd b/man/bootstrap.Rd index 7d393fe..fe3196d 100644 --- a/man/bootstrap.Rd +++ b/man/bootstrap.Rd @@ -2,24 +2,32 @@ % Please edit documentation in R/mnems.r \name{bootstrap} \alias{bootstrap} -\title{run bootstrap simulations on the components} +\title{Bootstrap. +Run bootstrap simulations on the components (phi) of an object of +class mnem.} \usage{ -bootstrap(x, size = 1000, logtype = 2, ...) +bootstrap(x, size = 1000, p = 1, logtype = 2, ...) } \arguments{ \item{x}{mnem object} \item{size}{size of the booststrap simulations} -\item{logtype}{logarithm type of the data} +\item{p}{percentage of samples (e.g. for 100 E-genes p=0.5 means +sampling 50)} + +\item{logtype}{logarithm type of the data (e.g. 2 for log2 data or exp(1) +for natural)} \item{...}{additional parameters for hte nem function} } \value{ -returns bootstrap support for each edge +returns bootstrap support for each edge in each component (phi) } \description{ -run bootstrap simulations on the components +Bootstrap. +Run bootstrap simulations on the components (phi) of an object of +class mnem. } \examples{ sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) diff --git a/man/clustNEM.Rd b/man/clustNEM.Rd index ca14b12..1dc3f32 100644 --- a/man/clustNEM.Rd +++ b/man/clustNEM.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/mnems.r \name{clustNEM} \alias{clustNEM} -\title{clusters the data and performs standard nem on each cluster} +\title{Cluster NEM. +This function clusters the data and performs standard nem on each cluster.} \usage{ clustNEM(data, k = 2:5, ...) } @@ -17,7 +18,8 @@ clustNEM(data, k = 2:5, ...) family of nems } \description{ -clusters the data and performs standard nem on each cluster +Cluster NEM. +This function clusters the data and performs standard nem on each cluster. } \examples{ sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) diff --git a/man/getAffinity.Rd b/man/getAffinity.Rd index 0c2dd78..c2e81f6 100644 --- a/man/getAffinity.Rd +++ b/man/getAffinity.Rd @@ -2,7 +2,10 @@ % Please edit documentation in R/mnems.r \name{getAffinity} \alias{getAffinity} -\title{Calculate responsibilities} +\title{Calculate responsibilities. +This function calculates the responsibilities +of each component for all cells from the expected log distribution of the +hidden data.} \usage{ getAffinity(x, affinity = 0, norm = TRUE, logtype = 2, mw = NULL, data = matrix(0, 2, ncol(x))) @@ -15,9 +18,10 @@ during inference (not recommended)} \item{norm}{if TRUE normalises to probabilities (recommended)} -\item{logtype}{logarithm type of the data} +\item{logtype}{logarithm type of the data (e.g. 2 for log2 data or exp(1) +for natural)} -\item{mw}{miture weights} +\item{mw}{mixture weights of the components} \item{data}{data in log odds} } @@ -25,7 +29,10 @@ during inference (not recommended)} responsibilities as a kxl matrix (k components, l cells) } \description{ -Calculate responsibilities +Calculate responsibilities. +This function calculates the responsibilities +of each component for all cells from the expected log distribution of the +hidden data. } \examples{ sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) diff --git a/man/getIC.Rd b/man/getIC.Rd index c870557..3cc01c0 100644 --- a/man/getIC.Rd +++ b/man/getIC.Rd @@ -2,7 +2,11 @@ % Please edit documentation in R/mnems.r \name{getIC} \alias{getIC} -\title{calculate negative penalized log likelihood} +\title{Calculate negative penalized log likelihood. +This function calculates +a negative penalized log likelihood given a object of class mnem. This +penalized likelihood is based on the normal likelihood and penalizes +complexity of the mixture components (i.e. the networks).} \usage{ getIC(x, man = FALSE, degree = 4, logtype = 2, pen = 2, useF = FALSE, Fnorm = FALSE) @@ -10,23 +14,35 @@ getIC(x, man = FALSE, degree = 4, logtype = 2, pen = 2, useF = FALSE, \arguments{ \item{x}{mnem object} -\item{man}{logical. manual data penalty} +\item{man}{logical. manual data penalty, e.g. man=TRUE and pen=2 for an +approximation of the Akaike Information Criterion} -\item{degree}{different degree of complexity} +\item{degree}{different degree of penalty for complexity: positive entries +of transitively reduced phis or phi^r (degree=0), phi^r and mixture +components minus one k-1 (1), phi^r, k-1 and positive entries of thetas (2), +positive entries of transitively closed phis or phi^t, k-1 (3), phi^t, theta, +k-1 (4, default), all entries of phis, thetas and k-1 (5)} -\item{logtype}{logarithm type of the data} +\item{logtype}{logarithm type of the data (e.g. 2 for log2 data or exp(1) +for natural)} -\item{pen}{penalty weight for the data} +\item{pen}{penalty weight for the data (e.g. pen=2 for approximate Akaike +Information Criterion)} \item{useF}{use F (see publication) as complexity instead of phi and theta} -\item{Fnorm}{normalize complexity of F} +\item{Fnorm}{normalize complexity of F, i.e. if two components have the +same entry in F, it is only counted once} } \value{ -penlized likelihood +penalized likelihood } \description{ -calculate negative penalized log likelihood +Calculate negative penalized log likelihood. +This function calculates +a negative penalized log likelihood given a object of class mnem. This +penalized likelihood is based on the normal likelihood and penalizes +complexity of the mixture components (i.e. the networks). } \examples{ sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) diff --git a/man/hamSim.Rd b/man/hamSim.Rd index 544ec08..2540a88 100644 --- a/man/hamSim.Rd +++ b/man/hamSim.Rd @@ -2,14 +2,16 @@ % Please edit documentation in R/mnems.r \name{hamSim} \alias{hamSim} -\title{normalized hamming accuracy of the columns of two adjacency matrices} +\title{Accuracy for two phis. +This function uses the hamming distance to calculate +an accuracy for two networks (phi).} \usage{ hamSim(a, b, diag = 1, symmetric = TRUE) } \arguments{ -\item{a}{adjacency matrix} +\item{a}{adjacency matrix (phi)} -\item{b}{adjacency matrix} +\item{b}{adjacency matrix (phi)} \item{diag}{if 1 includes diagonal in distance, if 0 not} @@ -17,10 +19,12 @@ hamSim(a, b, diag = 1, symmetric = TRUE) comparison b to a} } \value{ -normalized hamming accuracy +normalized hamming accuracy for a and b } \description{ -normalized hamming accuracy of the columns of two adjacency matrices +Accuracy for two phis. +This function uses the hamming distance to calculate +an accuracy for two networks (phi). } \examples{ sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) diff --git a/man/mnem.Rd b/man/mnem.Rd index dac36d1..b4ef776 100644 --- a/man/mnem.Rd +++ b/man/mnem.Rd @@ -2,23 +2,26 @@ % Please edit documentation in R/mnems.r \name{mnem} \alias{mnem} -\title{learn mixture of networks from a single-ceel knock-down experiment} +\title{Mixture NEMs - main function. +This function simultaneously learns a mixture +of causal networks and clusters of a cell population from single cell +perturbation data (e.g. log odds of fold change) with a mutli-trait +readout. E.g. Pooled CRISPR scRNA-Seq data (Perturb-Seq. Dixit et al., 2016, +Crop-Seq. Datlinger et al., 2017).} \usage{ mnem(D, inference = "em", search = "modules", start = NULL, method = "llr", parallel = NULL, reduce = FALSE, runs = 1, starts = 3, type = "random", p = NULL, k = NULL, kmax = 10, verbose = FALSE, max_iter = 100, parallel2 = NULL, converged = 10^-1, redSpace = NULL, affinity = 0, evolution = FALSE, subtopoX = NULL, - ratio = TRUE, logtype = 2, initnets = FALSE, popSize = 10, - stallMax = 2, elitism = NULL, maxGens = Inf, domean = TRUE, + ratio = TRUE, logtype = 2, initnets = FALSE, domean = TRUE, modulesize = 5, compress = FALSE, increase = TRUE) } \arguments{ \item{D}{data with cells indexing the columns and features (E-genes) indexing the rows} -\item{inference}{inference method "em" for expectation maximization, -"greedy" or "genetic"} +\item{inference}{inference method "em" for expectation maximization} \item{search}{search method for single network inference "greedy", "exhaustive" or "modules"} @@ -28,67 +31,72 @@ k components} \item{method}{"llr" for log ratios or foldchanges as input (see ratio)} -\item{parallel}{number of threads} +\item{parallel}{number of threads for parallelization of the number of +em runs} -\item{reduce}{logical - reduce search space for exhaustive search} +\item{reduce}{logical - reduce search space for exhaustive search to +unique networks} -\item{runs}{how many runs for greedy search} +\item{runs}{number of runs for greedy search} -\item{starts}{how many starts for em} +\item{starts}{number of starts for the em} \item{type}{initialize with "random" probabilities or "cluster" the data} -\item{p}{initial probabilities} +\item{p}{initial probabilities as a k (components) times l (cells) matrix} \item{k}{number of components} \item{kmax}{maximum number of components when k=NULL is inferred} -\item{verbose}{output} +\item{verbose}{verbose output} -\item{max_iter}{maximum iteration if likelihood does not converge} +\item{max_iter}{maximum iteration, if likelihood does not converge} -\item{parallel2}{if parallel=NULL, number of threads for component +\item{parallel2}{if parallel=NULL, number of threads for single component optimization} -\item{converged}{absolute distance for convergence} +\item{converged}{absolute distance for convergence between new and old log +likelihood} -\item{redSpace}{space for exhaustive search} +\item{redSpace}{space for "exhaustive" search} -\item{affinity}{0 is default, 1 is for hard clustering} +\item{affinity}{0 is default for soft clustering, 1 is for hard clustering} -\item{evolution}{logical, penalty on difference of components} +\item{evolution}{logical. If TRUE components are penelized for being +different from each other.} -\item{subtopoX}{hard prior on theta} +\item{subtopoX}{hard prior on theta as a vector of S-genes for all E-genes} \item{ratio}{logical, if true data is log ratios, if false foldchanges} -\item{logtype}{logarithm of the data} +\item{logtype}{logarithm type of the data (e.g. 2 for log2 data or exp(1) +for natural)} -\item{initnets}{initial networks in a list} +\item{initnets}{initial networks as a list of phis} -\item{popSize}{population size} - -\item{stallMax}{maximum number of stall generations till convergence} - -\item{elitism}{number of networks to keep during evolution} - -\item{maxGens}{maximum numbero f generations} - -\item{domean}{average the data (speed imporvment)} +\item{domean}{average the data, when calculating a single NEM (speed +improvment)} \item{modulesize}{max number of S-genes per module in module search} -\item{compress}{compress networks after search} +\item{compress}{compress networks after search (warning: penelized +likelihood not interpretable)} \item{increase}{if set to FALSE, the algorithm will not stop if the likelihood decreases} } \value{ -optimized network for data fit +object of class mnem with the log expected of the hidden data +and phi and theta for all components k } \description{ -learn mixture of networks from a single-ceel knock-down experiment +Mixture NEMs - main function. +This function simultaneously learns a mixture +of causal networks and clusters of a cell population from single cell +perturbation data (e.g. log odds of fold change) with a mutli-trait +readout. E.g. Pooled CRISPR scRNA-Seq data (Perturb-Seq. Dixit et al., 2016, +Crop-Seq. Datlinger et al., 2017). } \examples{ sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) diff --git a/man/plot.mnem.Rd b/man/plot.mnem.Rd index 7fa7a64..c84a910 100644 --- a/man/plot.mnem.Rd +++ b/man/plot.mnem.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mnems.r \name{plot.mnem} \alias{plot.mnem} -\title{plot mnem result} +\title{Plot mnem result.} \usage{ \method{plot}{mnem}(x, oma = c(3, 1, 1, 3), main = "M&NEM", anno = TRUE, cexAnno = 1, scale = NULL, global = TRUE, egenes = TRUE, @@ -18,51 +18,56 @@ \item{main}{main text} -\item{anno}{annotate cells} +\item{anno}{annotate cells by their perturbed gene} -\item{cexAnno}{text size} +\item{cexAnno}{text size of the cell annotations} -\item{scale}{scale cells} +\item{scale}{scale cells to show relative and not absolute distances} -\item{global}{global clustering} +\item{global}{if TRUE clusters all cells, if FALSE clusters cells within +a component} -\item{egenes}{show egene attachments} +\item{egenes}{show egene attachments, i.e. number of E-genes +assigned to each S-gene} -\item{sep}{seperate clusters} +\item{sep}{seperate clusters and not put them on top of each other +for better visualization} -\item{tsne}{use tsne} +\item{tsne}{if TRUE use tsne instead of pca} -\item{affinity}{logical use hard clustering if true} +\item{affinity}{use hard clustering if TRUE} -\item{logtype}{logarithm type of the data} +\item{logtype}{logarithm type of the data (e.g. 2 for log2 data or exp(1) +for natural)} -\item{cells}{show ccell attachments} +\item{cells}{show cell attachments, .i.e how many cells are assigned +to each S-gene} \item{pch}{cell symbol} \item{legend}{show legend} -\item{showdata}{show data if true} +\item{showdata}{show data if TRUE} -\item{bestCell}{show probability of best fitting cell} +\item{bestCell}{show probability of best fitting cell for each S-gene} -\item{showprobs}{show probabilities} +\item{showprobs}{if TRUE, shows responsibilities for all cells and components} -\item{shownull}{show null node} +\item{shownull}{if TRUE, shows the null node} -\item{ratio}{use log ratios (if true) or foldchanges} +\item{ratio}{use log ratios (TRUE) or foldchanges (FALSE)} \item{method}{"llr" for ratios} -\item{showweights}{shwo weights} +\item{showweights}{if TRUE, shows mixture weights for all components} \item{...}{additional parameters} } \value{ -visualization of mnem result Rgraphviz +visualization of mnem result with Rgraphviz } \description{ -plot mnem result +Plot mnem result. } \examples{ sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) diff --git a/man/plotDnf.Rd b/man/plotDnf.Rd index 4869069..b1e57eb 100644 --- a/man/plotDnf.Rd +++ b/man/plotDnf.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/mnems.r \name{plotDnf} \alias{plotDnf} -\title{function for visualizing graphs in normal form} +\title{Plot disjunctive normal form. +This function visualizes a graph encodedas a disjunctive nromal form.} \usage{ plotDnf(dnf = NULL, freq = NULL, stimuli = c(), signals = c(), inhibitors = c(), connected = TRUE, CNOlist = NULL, cex = NULL, @@ -20,21 +21,25 @@ plotDnf(dnf = NULL, freq = NULL, stimuli = c(), signals = c(), draw = TRUE, ...) } \arguments{ -\item{dnf}{Hyper-graph in disjunctive normal form.} +\item{dnf}{Hyper-graph in disjunctive normal form, +e.g. c("A=B", "A=C+D", "E=!B") with the child on the left and the parents +on the right of the equation with "A=C+D" for A = C AND D. Alternatively, +dnf can be an adjacency matrix, which is converted on the fly to a +disjunctive normal form.} -\item{freq}{Frequency of hyper-edges.} +\item{freq}{Frequency of hyper-edges which are placed on the edges.} -\item{stimuli}{Vertices which can be stimulated.} +\item{stimuli}{Highlights vertices which can be stimulated.} -\item{signals}{Vertices which regulate E-genes.} +\item{signals}{Highlights vertices which regulate E-genes.} -\item{inhibitors}{Vertices which can be inhibited.} +\item{inhibitors}{Highlights vertices which can be inhibited.} \item{connected}{If TRUE, only includes vertices which are connected to other vertices.} \item{CNOlist}{CNOlist object. Optional instead of stimuli, inhibitors or -signals.} +signals. See package CellNOptR.} \item{cex}{Global font size.} @@ -42,14 +47,14 @@ signals.} \item{labelsize}{Edge label size.} -\item{type}{Different plot types.} +\item{type}{Different plot types. 2 for Rgraphviz and 1 for graph.} \item{lwd}{Line width.} \item{edgelwd}{Edgeline width.} \item{legend}{0 shows no legend. 1 shows legend as a graph. 2 shows legend -in a box.} +in a standard box.} \item{x}{x coordinate of box legend.} @@ -139,7 +144,8 @@ E.g. list(top = c("A", "B"), bottom = c("C", "D")).} Rgraphviz object } \description{ -function for visualizing graphs in normal form +Plot disjunctive normal form. +This function visualizes a graph encodedas a disjunctive nromal form. } \examples{ g <- c("!A+B=G", "C=G", "!D=G", "C+D+E=G") diff --git a/man/sim.Rd b/man/sim.Rd index 8b5ca40..c2d22d7 100644 --- a/man/sim.Rd +++ b/man/sim.Rd @@ -5,26 +5,16 @@ \alias{sim} \title{Simulation results Example data: simulation results -Contains simulation results. How they were -aquired is explained in the vignette. -The data conists of a list of data matrices holding -sensitivity and specificity -(spec, sens) of network edges for the variious methods compared to -the ground truth, sensitivity and specificity (sens2, spec2) -of the expected data for clustNEM and mnem.} +Contains simulation results. +For details see the vignette.} \usage{ sim } \description{ Simulation results Example data: simulation results -Contains simulation results. How they were -aquired is explained in the vignette. -The data conists of a list of data matrices holding -sensitivity and specificity -(spec, sens) of network edges for the variious methods compared to -the ground truth, sensitivity and specificity (sens2, spec2) -of the expected data for clustNEM and mnem. +Contains simulation results. +For details see the vignette. } \examples{ data(sim) diff --git a/man/simData.Rd b/man/simData.Rd index ac38817..3c06a87 100644 --- a/man/simData.Rd +++ b/man/simData.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/mnems.r \name{simData} \alias{simData} -\title{simulate single cell data from a mixture of networks} +\title{Simulate data. This function simulates single cell data from a random +mixture of networks.} \usage{ simData(Sgenes = 5, Egenes = 1, subsample = 1, Nems = 2, reps = NULL, mw = NULL, evolution = FALSE, nCells = 1000, uninform = 0, @@ -16,27 +17,29 @@ simData(Sgenes = 5, Egenes = 1, subsample = 1, Nems = 2, reps = NULL, \item{subsample}{range to subsample data. 1 means the full simulated data is used.} -\item{Nems}{numberof components} +\item{Nems}{number of components} -\item{reps}{number of relicates, if set (not realistice for cells)} +\item{reps}{number of replicates, if set (not realistic for cells)} -\item{mw}{mixture weights} +\item{mw}{mixture weights (has to be vector of length Nems)} -\item{evolution}{evovling network, if set to true} +\item{evolution}{evovling and not purely random network, if set to TRUE} \item{nCells}{number of cells} \item{uninform}{number of uninformative Egenes} -\item{unitheta}{uniform theta, if true} +\item{unitheta}{uniform theta, if TRUE} -\item{edgeprob}{edge probability} +\item{edgeprob}{edge probability, value between 0 and 1 for sparse or +dense networks} } \value{ simulation object with meta information and data } \description{ -simulate single cell data from a mixture of networks +Simulate data. This function simulates single cell data from a random +mixture of networks. } \examples{ sim <- simData(Sgenes = 3, Egenes = 2, Nems = 2, mw = c(0.4,0.6)) diff --git a/vignettes/mnem.Rmd b/vignettes/mnem.Rmd index 7a961e8..7f9b940 100644 --- a/vignettes/mnem.Rmd +++ b/vignettes/mnem.Rmd @@ -1,6 +1,6 @@ --- title: "Mixture Nested Effects Models \n -Simulataneous inferring of causal networks and corresponding subpopulations +Simultaneous inference of causal networks and corresponding subpopulations from single cells perturbation data." author: "Martin Pirkl, Niko Beerenwinkel" date: "`r Sys.Date()`" @@ -12,72 +12,197 @@ vignette: > %\VignetteEncoding{UTF-8} --- -# Mixture Nested Effects Models (M&NEM) +# Mixture Nested Effects Models (M\&NEM) Single cell RNA-seq data sets from pooled CrispR screens provide the possibility -to analyzse heterogeneous cell populations. We extended the original -Nested Effects Models (NEM) to Mixture Nested Effects Models (M&NEM) to -simulataneously identify several causal signalling graphs and +to analyze heterogeneous cell populations. We extended the original +Nested Effects Models (NEM) to Mixture Nested Effects Models (M\&NEM) to +simultaneously identify several causal signaling graphs and corresponding subpopulations of cells. The final result will be a soft -clustering of the perturbed cells and a causal signalling graph, which -describes the interactions of the perturbed genens for each cluster of -cells. +clustering of the perturbed cells and a causal signaling graph, which +describes the interactions of the perturbed signaling genes (S-genes) for +each cluster of cells and the sub-topology for the observed genes (E-genes). + +The M\&NEM algorithm uses an expectation maximization (EM) algorithm to +infer an optimum for $k$ components. In the E-step the expectation of +the hidden data (assignment of a cell to a component aka responsibilities) +is calculated. Based on the responsibilities M\&NEM weights the data for +each component and the standard NEM approach is used to optimize the +causal network for each weighted data set. ## Installation and loading -```{r, global_options, include=FALSE} -knitr::opts_chunk$set(message=FALSE) +```{r global_options, include=FALSE} +knitr::opts_chunk$set(message=FALSE, out.width="125%", fig.align="center", + strip.white=TRUE, warning=FALSE, tidy=TRUE, + #out.extra='style="display:block; margin:auto;"', + fig.height = 4, fig.width = 8, error=FALSE) +fig.cap0 <- "Heatmap of the simulated log odds. Effects are blue and no effects +are red." +fig.cap01 <- "Mixture NEM result for the simulated data. On top we show +the cell assignments for hard clustering and the mixture weights. The large +circular vertices depict the S-genes, the small ones the responsibility for +the best fitting cell, the diamonds the number of assigned cells and the boxes +the number of assigned E-genes." +fig.cap02 <- "Ground truth causal networks of our simulated mixture of cells" fig.cap1 <- "Accuracy of the inferred networks. The rows denote the number -of components of the groudn truth (1 to 5). The accuracy of mixture nems is -shown in red, for a naive clustering approach in blue and for standard nem +of components of the ground truth (1 to 5). The accuracy of mixture NEMs is +shown in red, for a naive clustering approach in blue and for standard NEM with one component in grey." fig.cap2 <- "Accuracy of the inferred number of components. The rows denote the number -of components of the groudn truth (1 to 5). The accuracy of mixture nems is -shown in red, for a naive clustering approach in blue and for standard nem +of components of the ground truth (1 to 5). The accuracy of mixture NEMs is +shown in red, for a naive clustering approach in blue and for standard NEM with one component in grey." fig.cap3 <- "Accuracy of the mixture weights. The rows denote the number -of components of the groudn truth (1 to 5). The accuracy of mixture nems is -shown in red, for a naive clustering approach in blue and for standard nem +of components of the ground truth (1 to 5). The accuracy of mixture NEMs is +shown in red, for a naive clustering approach in blue and for standard NEM with one component in grey." -fig.cap4 <- "As a comparison to the inferred mixture wieght we simulated +fig.cap4 <- "As a comparison to the inferred mixture weight we simulated random mixture weights for different components." -fig.cap5 <- "Penelized and raw log likelihood ratios. Blue denotes the raw -loglikelihood and red the negative penalization for complexity for the CROPSeq +fig.cap5 <- "Penalized and raw log likelihood ratios. Blue denotes the raw +log likelihood and red the negative penalized for complexity for the CROPSeq (A) and the two PERTURBSeq datasets (B,C)." -fig.cap6 <- "Histograms of the responsibilities. The responsibilties for the -highest scoring according to the penelized loglikelihood for the CROPSeq (A) +fig.cap6 <- "Histograms of the responsibilities. The responsibilities for the +highest scoring according to the penalized log likelihood for the CROPSeq (A) the two PERTURBSeq datasets (B,C)." -fig.cap7 <- "Highest scoring mixture for the CROPSeq dataset." -fig.cap8 <- "Second highest scoring mixture for the CROPSeq dataset." +fig.cap7 <- "Highest scoring mixture for the CROPSeq dataset. On top we show +the cell assignments for hard clustering and the mixture weights. The large +circular vertices depict the S-genes, the small ones the responsibility for +the best fitting cell, the diamonds the number of assigned cells and the boxes +the number of assigned E-genes." +fig.cap8 <- "Second highest scoring mixture for the CROPSeq dataset. On top we +show the cell assignments for hard clustering and the mixture weights. The large +circular vertices depict the S-genes, the small ones the responsibility for +the best fitting cell, the diamonds the number of assigned cells and the boxes +the number of assigned E-genes." fig.cap9 <- "Highest scoring mixture for the PERTURBSeq dataset of cell -cycle regulators." +cycle regulators. On top we show +the cell assignments for hard clustering and the mixture weights. The large +circular vertices depict the S-genes, the small ones the responsibility for +the best fitting cell, the diamonds the number of assigned cells and the boxes +the number of assigned E-genes." fig.cap10 <- "Second highest scoring mixture for the PERTURBSeq dataset of cell -cycle regulators." +cycle regulators. On top we show +the cell assignments for hard clustering and the mixture weights. The large +circular vertices depict the S-genes, the small ones the responsibility for +the best fitting cell, the diamonds the number of assigned cells and the boxes +the number of assigned E-genes." fig.cap11 <- "Highest scoring mixture for the PERTURBSeq dataset of -transcription factors." +transcription factors. On top we show +the cell assignments for hard clustering and the mixture weights. The large +circular vertices depict the S-genes, the small ones the responsibility for +the best fitting cell, the diamonds the number of assigned cells and the boxes +the number of assigned E-genes." fig.cap12 <- "Second highest scoring mixture for the PERTURBSeq dataset of -transcription factors." +transcription factors. On top we show +the cell assignments for hard clustering and the mixture weights. The large +circular vertices depict the S-genes, the small ones the responsibility for +the best fitting cell, the diamonds the number of assigned cells and the boxes +the number of assigned E-genes." +paltmp <- palette() +paltmp[3] <- "blue" +paltmp[4] <- "brown" +palette(paltmp) ``` Install the package with the bioconductor function. ```{r, eval=FALSE} source("https://bioconductor.org/biocLite.R") biocLite("mnem") ``` -Load the package with the library function. +Load the package with the library function. ```{r} library(mnem) ``` - + +## Mini example + +The input data is an m times n matrix of log odds (e.g. for fold changes as in + the R package "limma"). The *i*th row of the *j*th column are the log odds + for feature (E-gene) i in cell j. As in the case of the original NEM, the + data consists of a multitude of E-genes. However, where for the original + NEM only one column complies to each perturbed signaling gene (S-gene), + M\&NEM can is designed for single cell data and therefore can handle and + draws its power from multiple cells for each perturbed S-gene. + +The M\&NEM package includes a functions for the simulation of typical single + cell log odds. We use this function to create data for three S-genes and + eight E-genes. Two E-genes for each S-gene and two uninformative E-genes. + Additionally we assume generate the data from a mixture of two components + and sample $100$ cells, $20$ for one and $80$ for the other component + (approx. mixture weights: mw). The function simulates discreet data ($1$ + for effect and $0$ for no effect). We transform the discreet data to log + odds by adding Gaussian with mean $1$ to all $1$s and with mean $-1$ to + all $0$s. Figure 1 shows a heatmap of our generated data.Since we used + only little noise, effects and no effects are still clearly distinguished + into shades of blue and red. We can identify the uninformative E-genes + (no rownames) by their random/unique patterns. Cells perturbed by S-gene + $1$ all behave similarly. However, cells perturbed by S-genes $2$ and $3$ + fall into two clusters each hinting at differential causal regulation + within the cell population. +```{r, fig.height=6, fig.width=10, fig.cap=fig.cap0} +set.seed(9247) +simmini <- simData(Sgenes = 3, Egenes = 10, + Nems = 2, mw = c(0.2, 0.8), nCells = 100, uninform = 2) +data <- simmini$data +ones <- which(data == 1) +zeros <- which(data == 0) +data[ones] <- rnorm(length(ones), 1, 0.1) +data[zeros] <- rnorm(length(zeros), -1, 0.1) +epiNEM::HeatmapOP(data, col = "RdBu", cexRow = 0.75, cexCol = 0.75, + bordercol = "transparent", xrot = 0, dendrogram = "both") +``` + +We "forget" the solution stored in sim$Nem and use the 'mnem' function to infer + the network. We choose our a greedy search for the M-step and $10$ + independent starts. Since the number of components $k$ is a priori + not known, we perform optimization for $k=1,2,3$ and use our penalized + log likelihood to choose the best $k$. Figure 2 shows the best mixture of + our causal networks with $k=2$. We indeed have found the same causal + networks we used to simulate the data. Notice, that our input for the + mixture weights is only an approximation, since the real mixture of the + data is dependent on the ground truth networks (e.g. how similar they are) + and noise. +```{r, fig.height=6, fig.width=14, fig.cap=fig.cap01} +mlist <- list() +mpen <- numeric(3) +for (k in 1:3) { + mlist[[k]] <- mnem(data, k = k, search = "greedy", runs = 10) + mpen[k] <- getIC(mlist[[k]]) +} +plot(mlist[[which.min(mpen)]]) +``` +```{r, fig.cap=fig.cap02, out.width="75%"} +par(mfrow=c(1,2)) +plotDnf(simmini$Nem[[1]], nodelabels=list(A="1", B="2", C="3"), + bordercol="red", main = "Mixture weigth: 20%") +plotDnf(simmini$Nem[[2]], nodelabels=list(A="1", B="2", C="3"), + bordercol="blue", main = "Mixture weigth: 80%") +``` + ## Simulations -We simulate cells based on a ground truth mixture of networks. We use M&NEM to -infer an optimal network from the data and compare the result ot the ground +We simulate cells based on a ground truth mixture of networks. We use M\&NEM to +infer an optimal network from the data and compare the result to the ground truth. We also look at the accuracy of the mixture weights and the the number -of networks K.For large scale simulation see section 'Data generation' at the -end of the vignette. +of networks $k$. See also section 'Data generation' at the end of the vignette. Simulation results are evaluated by accuracy of the networks, mixture weights and number of components. +The data object 'sim' contains four lists with results of mixture with +$3$, $5$, $10$ and $20$ S-genes. Each list consists of an array including $100$ +runs for three different noise levels, five different numbers of +components $(1,2,3,4,5)$ and three different methods, which results in overall +$2400$ runs. For each run the array includes values for time, +over-fit, component accuracy (accuracy) and mixing weight accuracy +(mixing). + +The different methods we compared are besides M\&NEM, random drawing of of the +causal network and a naive cluster approach (cNEM). For cNEM we first +cluster (k-means) all cells based on a correlation distance and then use the +standard NEM approach on each cluster. + +We show the accuracy plots for for the components, number of components +and mixture weights. ```{r, fig.height=8, fig.width=10, fig.cap=fig.cap1} data(sim) noises <- c(1,2.5,5) @@ -130,7 +255,9 @@ for (i in 1:4) { at = par("usr")[1] - (par("usr")[2]-par("usr")[1])*(3.1 - (i-1)*1.2)) } ``` -```{r, fig.height=8, fig.width=10, fig.cap=fig.cap2} +For the code of the latter two (only minmal difference in code) see the R code + of this vignette. +```{r, fig.height=8, fig.width=10, fig.cap=fig.cap2, echo=FALSE} noises <- c(1,2.5,5) nems <- 1:5 simres3 <- sim[[1]] @@ -180,7 +307,7 @@ for (i in 1:4) { at = par("usr")[1] - (par("usr")[2]-par("usr")[1])*(3.1 - (i-1)*1.2)) } ``` -```{r, fig.height=8, fig.width=10, fig.cap=fig.cap3} +```{r, fig.height=8, fig.width=10, fig.cap=fig.cap3, echo=FALSE} noises <- c(1,2.5,5) nems <- 1:5 simres3 <- sim[[1]] @@ -234,8 +361,7 @@ for (i in 1:4) { We randomly draw mixture weights and components as a comparison to our inferred weights. - -```{r, fig.width = 4, fig.height = 4, warning=FALSE, fig.cap = fig.cap4} +```{r, fig.width = 4, fig.height = 4, fig.cap = fig.cap4, out.width="60%"} runs <- 100 maxk <- 5 mixrand <- matrix(0, runs, maxk) @@ -257,14 +383,31 @@ for (j in 1:5) { boxplot(mixrand, col = "grey", main = "random", xlab = "components K") ``` -## Application to pooled CRISPR screens from CROPSeq and PERTURBSeq +Figures 4-7 show that M\&NEM performs better as a random and the naive +clustering approach when it comes to overall accuracy of the +causal networks, number of components and mixture weights. -We apply M&NEM to three different data sets from two different pooled CRISPR -screens, Crop-Seq (Datlinger *et al.*, 2017) and -Perturb-Seq (Dixit *et al.*, 2016). +## Application to pooled CRISPR screens from CROPSeq and PERTURBSeq -We optimized the mixture for components K=1,2,3,4,5. Hence, we use our penelized -likelihood to finde the optimal number of components without overfitting. +We apply M\&NEM to three different data sets from two different pooled CRISPR +screens, CROPSeq (Datlinger *et al.*, 2017) and +PERTURBSeq (Dixit *et al.*, 2016). Those screen consists of single cell RNAseq +data. Each cell has been perturbed by a knock-down of a gene +(S-gene) and each cell is observed by its gene expression profile +(E-genes). + +We optimized the mixture for components $k=1,2,3,4,5$. Hence, we use our +penalized likelihood to find the optimal number of components without +over-fitting. + +The data object 'app' consists of three lists containing the results +for CROPSeq, PERTURBSeq (transcription factors) and PERTURBSeq +(cell cycle regulators). Each list contains the results for +$k=1,2,3,4,5$ (number of components). + +For each of the three data sets we show the raw log likelihood +together with the penalized likelihood. We choose the optimal $k$ at +the minimum of the penalized log likelihood. ```{r, fig.height = 3, fig.width = 11, fig.cap = fig.cap5} data(app) res2 <- app @@ -298,17 +441,15 @@ for (j in 1:3) { at = par("usr")[1] - (par("usr")[2]-par("usr")[1])*0.27) } -plot(0, xlim = c(1,100), ylim = c(90,100), xaxt = "n", yaxt = "n", xlab = "", - ylab = "") -legend(1, 100, c("raw log likelihood ratio", "penalized log likelihood ratio"), - border = "transparent", cex = 1.25, lty = 1, pch = 1, - col = c("blue", "red")) ``` - -Each cell has a certain probability to have been generated by a components -(responsibility). The histograms show the responsibilities for all three -data sets. - +Figure 8 shows that for each data set, the optimal mixture is $k=2$ according to +our penalized log likelihood. However, while for the first and second +data set even $k=2$ beats just a single network, for the third data set +there is only weak evidence for a mixture of $k>1$. + +Each cell has a certain probability (responsibility) to have been +generated by a components. The histograms show the distribution of +the responsibilities for all cells for the three data sets. ```{r, fig.height = 3, fig.width = 11, fig.cap = fig.cap6} par(mfrow=c(1,3), oma = c(0,0,1,0), mar = rep(0,4)) for (j in 1:3) { @@ -335,36 +476,97 @@ for (j in 1:3) { at = par("usr")[1] - (par("usr")[2]-par("usr")[1])*0.27) } ``` - -We show the highest and second highest scoring networks for each data set. - -```{r, fig.height = 10, fig.width = 30, fig.cap = fig.cap7} -paltmp <- palette() -paltmp[3] <- "blue" -paltmp[4] <- "brown" -palette(paltmp) -par(mfrow=c(6,1), oma = c(0,0,1,0), mar = rep(0,4)) -for (j in 1:3) { - res <- res2[[j]] - for (i in 2:5) { - res[[i]]$data <- res[[1]]$data - } - bics <- rep(0, maxk) - ll <- rep(0, maxk) - for (i in seq_len(maxk)) { - bics[i] <- getIC(res[[i]]) - ll[i] <- max(res[[i]]$ll) - } - ll2 <- ll - ll <- (ll/(max(ll)-min(ll)))*(max(bics)-min(bics)) - ll <- ll - min(ll) + min(bics) - ll3 <- seq(min(bics), max(bics[!is.infinite(bics)]), length.out = 5) - i <- which.min(bics) - bics[i] <- bics[which.max(bics)] - i2 <- which.min(bics) - plot(res[[i]]) - plot(res[[i2]]) +Figure 9 shows the distribution of responsibilities for all three data sets. +The responsibilities for the first data set are almost binary, i.e. +almost each cell belongs to $100\%$ to one mixture component. For the +other two data sets there is a much more smooth transition. While +there are still many cells with a responsibility of $100\%$, many cells +have one around $50\%$. For the third data set even more cells have +responsibilities between $40\%$ and $50\%$. This is in accordance to the +penalized log likelihood and is evidence, that the two mixture +components are very similar and most of the causal network is stable +across cells. + +We show the highest and second highest scoring networks for each data set. On +top is the percentage of cells assigned to each network by hard clustering +and the mixture weights. The connected large circles depict the causal +network of S-genes. The small circles show the responsibility of the best +fitting cell. The boxes show the number E-genes assigned to each S-gene and +the diamonds show the numbers of cells assigned to each S-gene. The NULL +node is assigned E-genes, which fit best to an S-gene (NULL) with no effects +at all. For the R code of the second and third data sets see the accompanyning +R file (only the "j <- 1" needs to be changed to 2 or 3). +```{r, fig.height = 8, fig.width = 16, fig.cap = fig.cap7} +j <- 1 +res <- res2[[j]] +for (i in 2:5) { + res[[i]]$data <- res[[1]]$data +} +bics <- rep(0, maxk) +ll <- rep(0, maxk) +for (i in seq_len(maxk)) { + bics[i] <- getIC(res[[i]]) + ll[i] <- max(res[[i]]$ll) +} +ll2 <- ll +ll <- (ll/(max(ll)-min(ll)))*(max(bics)-min(bics)) +ll <- ll - min(ll) + min(bics) +ll3 <- seq(min(bics), max(bics[!is.infinite(bics)]), length.out = 5) +i <- which.min(bics) +bics[i] <- bics[which.max(bics)] +i2 <- which.min(bics) +plot(res[[i]]) +``` +```{r, fig.height = 8, fig.width = 20, fig.cap = fig.cap7} +plot(res[[i2]]) +``` +```{r, fig.height = 10, fig.width = 20, fig.cap = fig.cap7, echo = FALSE} +j <- 2 +res <- res2[[j]] +for (i in 2:5) { + res[[i]]$data <- res[[1]]$data } +bics <- rep(0, maxk) +ll <- rep(0, maxk) +for (i in seq_len(maxk)) { + bics[i] <- getIC(res[[i]]) + ll[i] <- max(res[[i]]$ll) +} +ll2 <- ll +ll <- (ll/(max(ll)-min(ll)))*(max(bics)-min(bics)) +ll <- ll - min(ll) + min(bics) +ll3 <- seq(min(bics), max(bics[!is.infinite(bics)]), length.out = 5) +i <- which.min(bics) +bics[i] <- bics[which.max(bics)] +i2 <- which.min(bics) +plot(res[[i]]) +``` +```{r, fig.height = 10, fig.width = 25, fig.cap = fig.cap7, echo = FALSE} +plot(res[[i2]]) +``` +```{r, fig.height = 8, fig.width = 16, fig.cap = fig.cap7, echo = FALSE} +j <- 3 +res <- res2[[j]] +for (i in 2:5) { + res[[i]]$data <- res[[1]]$data +} +bics <- rep(0, maxk) +ll <- rep(0, maxk) +for (i in seq_len(maxk)) { + bics[i] <- getIC(res[[i]]) + ll[i] <- max(res[[i]]$ll) +} +ll2 <- ll +ll <- (ll/(max(ll)-min(ll)))*(max(bics)-min(bics)) +ll <- ll - min(ll) + min(bics) +ll3 <- seq(min(bics), max(bics[!is.infinite(bics)]), length.out = 5) +i <- which.min(bics) +bics[i] <- bics[which.max(bics)] +i2 <- which.min(bics) +plot(res[[i]]) +``` +```{r, fig.height = 8, fig.width = 8, fig.cap = fig.cap7, echo = FALSE} +plot(res[[i2]]) ``` # Session information @@ -395,8 +597,7 @@ Cell, 167(7), 1853???1866.e17. # Generating the data objects -The following code was used to generate the data for the simulation and - application results. See the accompanying R code. +The R code for generating the data objects is shown in the accompanying R file. ```{r, eval=FALSE, include=FALSE} data <- read.csv("GSE92872_CROP-seq_Jurkat_TCR.digital_expression.csv",