From 09c66824a5f5ef320ecf859d91a26ea11abb735b Mon Sep 17 00:00:00 2001 From: Irisapo Date: Fri, 12 Apr 2019 10:25:10 -0400 Subject: [PATCH 001/149] output est-phi, est-eta --- R/decon.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/decon.R b/R/decon.R index 91684660..6312d8dd 100644 --- a/R/decon.R +++ b/R/decon.R @@ -317,10 +317,10 @@ DecontXoneBatch = function(counts, z=NULL, batch=NULL, max.iter=200, beta=1e-6, run.params = list("beta.init"=beta, "delta.init"=delta.init, "iteration"=iter-1L, "seed"=seed) res.list = list("logLikelihood" = ll, "est.nativeCounts"=next.decon$est.rmat , "est.conp"= res.conp, "theta"=theta , "delta"=delta) - #if( decon.method=="clustering" ) { - # posterior.params = list( "est.GeneDist"=phi, "est.ConDist"=eta ) - # res.list = append( res.list , posterior.params ) - #} + if( decon.method=="clustering" ) { + posterior.params = list( "est.GeneDist"=phi, "est.ConDist"=eta ) + res.list = append( res.list , posterior.params ) + } return(list("run.params"=run.params, "res.list"=res.list, "method"=decon.method )) } From 118d7dccae732e871ec392c66622a97413abf432 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Tue, 23 Apr 2019 15:40:15 -0400 Subject: [PATCH 002/149] decontX vignettes change all DecontX to decontX --- vignettes/DecontX-analysis.Rmd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vignettes/DecontX-analysis.Rmd b/vignettes/DecontX-analysis.Rmd index ca3facf1..e991344a 100644 --- a/vignettes/DecontX-analysis.Rmd +++ b/vignettes/DecontX-analysis.Rmd @@ -1,21 +1,21 @@ --- -title: "Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX" +title: "Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with decontX" author: "Shiyi Yang, Sean Corbett, Yusuke Koga, Zhe Wang, W. Evan Johnson, Masanao Yajima, Joshua D. Campbell" date: "`r Sys.Date()`" output: BiocStyle::html_document: toc: true vignette: > - %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX} + %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with decontX} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- # Introduction -DecontX is a Bayesian hierarchical model to estimate and remove cross-contamination from ambient RNA in single-cell RNA-seq count data generated from droplet-based sequencing devices. DecontX will take the count matrix with/without the cell labels and estimate the contamination level and deliver a decontaminted count matrix for downstream analysis. +decontX is a Bayesian hierarchical model to estimate and remove cross-contamination from ambient RNA in single-cell RNA-seq count data generated from droplet-based sequencing devices. DecontX will take the count matrix with/without the cell labels and estimate the contamination level and deliver a decontaminted count matrix for downstream analysis. -In this vignette we will demonstrate how to use DecontX to estimate and remove contamination. +In this vignette we will demonstrate how to use decontX to estimate and remove contamination. The package can be loaded using the `library` command. @@ -27,7 +27,7 @@ To see the latest updates and releases or to post a bug, see our GitHub page at # Generation of a cross-contaminated dataset -DecontX will take a matrix of counts (referred as observed counts) where each row is a feature, each column is a cell, and each entry in the matrix is the number of counts of each feature in each cell. To illustrate the utility of DecontX, we will apply it to a simulated dataset. +decontX will take a matrix of counts (referred as observed counts) where each row is a feature, each column is a cell, and each entry in the matrix is the number of counts of each feature in each cell. To illustrate the utility of DecontX, we will apply it to a simulated dataset. In the function `simulateContaminatedMatrix`, the K parameter designates the number of cell clusters, the C parameter determines the number of cells, the G parameter determines the number of genes in the simulated dataset. @@ -53,8 +53,8 @@ colSums(simCounts$eta) ``` -# Decontamination using DecontX -DecontX uses bayesian method to estimate and remove contamination via varitaional inference. +# Decontamination using decontX +decontX uses bayesian method to estimate and remove contamination via varitaional inference. ```{r, warning = FALSE, message = FALSE} decontxModel <- decontX(counts = simCounts$observedCounts, z = simCounts$z) ``` From 83ff27cd234733b2b121adf1fe4d055809b8f1f8 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Tue, 23 Apr 2019 15:42:06 -0400 Subject: [PATCH 003/149] add seed to simulateContaminatedMatrix --- R/decon.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index 30ca9550..e67f86f2 100644 --- a/R/decon.R +++ b/R/decon.R @@ -26,7 +26,10 @@ simulateContaminatedMatrix <- function(C = 300, K = 3, NRange = c(500, 1000), beta = 0.5, - delta = c(1, 2)) { + delta = c(1, 2), + seed = 428) { + + set.seed(seed) if (length(delta) == 1) { cpByC <- stats::rbeta(n = C, shape1 = delta, shape2 = delta) From f2237deef9c603c9ddc9b08f3c478b03fe918ac0 Mon Sep 17 00:00:00 2001 From: syyang Date: Thu, 25 Apr 2019 10:30:25 -0400 Subject: [PATCH 004/149] Update README.md --- README.md | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/README.md b/README.md index 6ed326d5..e53a7ff5 100755 --- a/README.md +++ b/README.md @@ -34,34 +34,6 @@ Vignettes are available in the package. An analysis example using celda with RNASeq via vignette('celda-analysis') -### Decontamination with DecontX -Highly expressed genes from various cells clusters will be expressed at low levels in other clusters in droplet-based systems due to contamination. DecontX will decompose an observed count matrix into a decontaminated expression matrix and a contamination matrix. The only other parameter needed is a vector of cell cluster labels. - -To simulate two 300 (gene) x 100 (cell) count matrices from 3 different cell types with total reads per cell ranged from 5000 to 40000: one matrix being ture expression matrix (rmat), the other matrix being contamination count matrix (cmat) -``` -sim.con = simulateContaminatedMatrix( C = 100, G = 300, K = 3, N.Range= c(5000, 40000), seed = 9124) -true.contamination.percentage = colSums( sim.con$cmat ) / colSums( sim.con$cmat + sim.con$rmat ) -str(sim.con) -# N.by.C: total transcripts per cell -# z: cell type label - -``` -Use DecontX to decompose the observed (contaminated) count matrix back into true expression matrix and a contamination matrix with specified cell label -``` -observedCounts = sim.con$observedCounts -cell.label = sim.con$z -new.counts = DecontX( counts = observedCounts, z = cell.label, max.iter = 200, seed = 123) -str(new.counts) -# Decontaminated matrix: new.counts$res.list$est.rmat -# Percentage of contamination per cell: new.counts$res.list$est.conp - -``` -DecontX Performance check -``` -estimated.contamination.percentage = new.counts$res.list$est.conp -plot( true.contamination.percentage, estimated.contamination.percentage) ; abline(0,1) -``` - ## New Features and announcements From 83c152c00ed3695055834b9b81fa4e84dd697a85 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 14 May 2019 11:30:16 -0400 Subject: [PATCH 005/149] subset rows and columns before row scaling fix #164 Also set annotation colors after subsetting --- R/celda_heatmap.R | 78 +++++++++++++++++++++++----------------------- man/plotHeatmap.Rd | 2 +- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/R/celda_heatmap.R b/R/celda_heatmap.R index a70f38b8..e3d1e7f4 100644 --- a/R/celda_heatmap.R +++ b/R/celda_heatmap.R @@ -9,7 +9,7 @@ #' NULL, no subsetting will be performed. Default NULL. #' @param cellIx Integer vector. Select cells for display in heatmap. If NULL, #' no subsetting will be performed. Default NULL. -#' @param scaleRow Function; A function to scale each individual row. Set to +#' @param scaleRow Function. A function to scale each individual row. Set to #' NULL to disable. Occurs after normalization and log transformation. Defualt #' is 'scale' and thus will Z-score transform each row. #' @param trim Numeric vector. Vector of length two that specifies the lower @@ -116,26 +116,6 @@ plotHeatmap <- function(counts, colorScheme <- match.arg(colorScheme) - if (!is.null(scaleRow)) { - if (is.function(scaleRow)) { - cn <- colnames(counts) - counts <- t(base::apply(counts, 1, scaleRow)) - colnames(counts) <- cn - } else { - stop("'scaleRow' needs to be of class 'function'") - } - } - - if (!is.null(trim)) { - if (length(trim) != 2) { - stop("'trim' should be a 2 element vector specifying the lower", - " and upper boundaries") - } - trim <- sort(trim) - counts[counts < trim[1]] <- trim[1] - counts[counts > trim[2]] <- trim[2] - } - ## Create cell annotation if (!is.null(annotationCell) & !is.null(z)) { if (is.null(rownames(annotationCell))) { @@ -173,6 +153,29 @@ plotHeatmap <- function(counts, annotationFeature <- NA } + ## Select subsets of features/cells + if (!is.null(featureIx)) { + counts <- counts[featureIx, , drop = FALSE] + if (length(annotationFeature) > 1 || + (length(annotationFeature) == 1 & !is.na(annotationFeature))) { + annotationFeature <- annotationFeature[featureIx, , drop = FALSE] + } + if (!is.null(y)) { + y <- y[featureIx] + } + } + + if (!is.null(cellIx)) { + counts <- counts[, cellIx, drop = FALSE] + if (length(annotationCell) > 1 || + (length(annotationCell) == 1 & !is.na(annotationCell))) { + annotationCell <- annotationCell[cellIx, , drop = FALSE] + } + if (!is.null(z)) { + z <- z[cellIx] + } + } + ## Set annotation colors if (!is.null(z)) { K <- sort(unique(z)) @@ -201,27 +204,25 @@ plotHeatmap <- function(counts, } } - ## Select subsets of features/cells - if (!is.null(featureIx)) { - counts <- counts[featureIx, , drop = FALSE] - if (length(annotationFeature) > 1 || - (length(annotationFeature) == 1 & !is.na(annotationFeature))) { - annotationFeature <- annotationFeature[featureIx, , drop = FALSE] - } - if (!is.null(y)) { - y <- y[featureIx] + # scale indivisual rows by scaleRow + if (!is.null(scaleRow)) { + if (is.function(scaleRow)) { + cn <- colnames(counts) + counts <- t(base::apply(counts, 1, scaleRow)) + colnames(counts) <- cn + } else { + stop("'scaleRow' needs to be of class 'function'") } } - if (!is.null(cellIx)) { - counts <- counts[, cellIx, drop = FALSE] - if (length(annotationCell) > 1 || - (length(annotationCell) == 1 & !is.na(annotationCell))) { - annotationCell <- annotationCell[cellIx, , drop = FALSE] - } - if (!is.null(z)) { - z <- z[cellIx] + if (!is.null(trim)) { + if (length(trim) != 2) { + stop("'trim' should be a 2 element vector specifying the lower", + " and upper boundaries") } + trim <- sort(trim) + counts[counts < trim[1]] <- trim[1] + counts[counts > trim[2]] <- trim[2] } ## Set color scheme and breaks @@ -244,7 +245,6 @@ plotHeatmap <- function(counts, seq(colorSchemeCenter + 1e-6, uBoundRange, length.out = colLen - round(colLen / 2))) } - } else { # Sequential color scheme if (is.null(col)) { diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index 180d681b..9cdefd74 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -25,7 +25,7 @@ features and columns represent cells. .} \item{y}{Numeric vector. Denotes feature module labels.} -\item{scaleRow}{Function; A function to scale each individual row. Set to +\item{scaleRow}{Function. A function to scale each individual row. Set to NULL to disable. Occurs after normalization and log transformation. Defualt is 'scale' and thus will Z-score transform each row.} From e27baa80d7878825e2e5bce867766db9b19d69fc Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 14 May 2019 13:51:56 -0400 Subject: [PATCH 006/149] update DESCRIPTION T # --- DESCRIPTION | 2 +- inst/NEWS | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9a8899c4..af7a2565 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.0.1 +Version: 1.0.2 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), diff --git a/inst/NEWS b/inst/NEWS index b1c29680..b671aa2c 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,7 @@ +Changes in version 1.0.2 (2019-05-14): + + o Fix a bug in celdaHeatmap + Changes in version 1.0.1 (2019-05-09): o Default seed setting to maintain reproducibility From 727daa3711fee3e39177a95c6e8c81cbaaa19701 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Thu, 16 May 2019 16:08:31 -0400 Subject: [PATCH 007/149] merge devel to RELEASE_3_9 --- DESCRIPTION | 2 +- inst/NEWS | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index af7a2565..3ef7aa5a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.0.2 +Version: 1.0.3 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), diff --git a/inst/NEWS b/inst/NEWS index 65bab558..755bd935 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,7 @@ +Changes in version 1.0.3 (2019-05-16): + + o Merge development branch with RELEASE_3_9 + Changes in version 1.0.2 (2019-05-14): o Fix a bug in celdaHeatmap From e041eff5909b8b6631a33b64cca4512699d8d5f0 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Mon, 20 May 2019 15:40:03 -0400 Subject: [PATCH 008/149] add MCDT --- DESCRIPTION | 2 +- inst/NEWS | 4 ++++ vignettes/buildTreeHybrid-analysis.Rmd | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b30655c6..d1e6819e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.1.2 +Version: 1.1.3 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), diff --git a/inst/NEWS b/inst/NEWS index d07e0095..4527252f 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,7 @@ +Changes in version 1.1.3 (2019-05-14): + + o Add multiclass decision tree (MCDT) cell cluster annotation + Changes in version 1.1.2 (2019-05-14): o Fix a bug in celdaHeatmap diff --git a/vignettes/buildTreeHybrid-analysis.Rmd b/vignettes/buildTreeHybrid-analysis.Rmd index 514217d9..93d81c2e 100644 --- a/vignettes/buildTreeHybrid-analysis.Rmd +++ b/vignettes/buildTreeHybrid-analysis.Rmd @@ -15,7 +15,7 @@ vignette: > **CE**llular **L**atent **D**irichlet **A**llocation (celda), is among multiple approaches to cluster cells from single-cell RNA-seq data into discrete sub-populations. In most cases, annotating a concise set of important features for distinguishing these sub-populations is not a trivial task. -In this vignette we will demonstrate the implementation of a multiclass decision tree approach to simultaneously sort and annotate cell cluster label estimations by generating a sequence of univariate rules for each cluster. +In this vignette we will demonstrate the implementation of a multiclass decision tree approach to simultaneously sort and annotate cell cluster label estimations by generating a sequence of univariate rules for each cluster. The procedure has two main deviations from simple multiclass decision tree procedures. First, at each split cells from the same cluster are never separated during tree building. Instead cells from the same population are moved to one-side of a particular split based on majority voting. Second, each cluster split can be determined by one of two heuristics, as follows... From fa60334ead5727f052d7bfa7800b8812b119d2b5 Mon Sep 17 00:00:00 2001 From: ericreed Date: Wed, 22 May 2019 10:20:08 -0400 Subject: [PATCH 009/149] Replace BuildTreeHybrid with FindMarkers. This adds functionality to generate decision tree with a priori meta clusters --- NAMESPACE | 3 +- R/buildTreeHybrid.R | 128 -------- R/findMarkers.R | 278 ++++++++++++++++++ R/getDecisions.R | 6 +- R/plotDendro.R | 27 +- R/summarizeTreeHelper.R | 17 +- man/{buildTreeHybrid.Rd => findMarkers.Rd} | 60 ++-- man/getDecisions.Rd | 6 +- man/plotDendro.Rd | 12 +- ...-analysis.Rmd => FindMarkers-analysis.Rmd} | 41 +-- 10 files changed, 355 insertions(+), 223 deletions(-) delete mode 100644 R/buildTreeHybrid.R create mode 100644 R/findMarkers.R rename man/{buildTreeHybrid.Rd => findMarkers.Rd} (66%) rename vignettes/{buildTreeHybrid-analysis.Rmd => FindMarkers-analysis.Rmd} (81%) diff --git a/NAMESPACE b/NAMESPACE index 4270e170..fb123e06 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,6 @@ export(appendCeldaList) export(availableModels) export(bestLogLikelihood) -export(buildTreeHybrid) export(celda) export(celdaGridSearch) export(celdaHeatmap) @@ -24,6 +23,7 @@ export(distinctColors) export(factorizeMatrix) export(featureModuleLookup) export(featureModuleTable) +export(findMarkers) export(geneSetEnrich) export(getDecisions) export(logLikelihood) @@ -91,6 +91,7 @@ import(grDevices) import(graphics) import(grid) import(gridExtra, except = c(combine)) +import(magrittr) import(stats, except = c(start, end)) importFrom(MAST,FromMatrix) importFrom(MAST,summary) diff --git a/R/buildTreeHybrid.R b/R/buildTreeHybrid.R deleted file mode 100644 index 614be6f2..00000000 --- a/R/buildTreeHybrid.R +++ /dev/null @@ -1,128 +0,0 @@ -#' @title Generate decision tree from single-cell clustering output. -#' @description Uses decision tree procudure to generate a set of rules for each -#' cell cluster defined by a single-cell clustering. Splits are determined by -#' one of two metrics at each split: a one-off metric to determine rules for -#' identifying clusters by a single feature, and a balanced metric to determine -#' rules for identifying sets of similar clusters. -#' @param features A L(features) by N(samples) numeric matrix. -#' @param class A vector of K label assignemnts. -#' @param oneoffMetric A character string. What one-off metric to run, either -#' `modified F1` or `pairwise AUC`. -#' @param threshold A numeric value. The threshold for the oneoff metric to use -#' between 0 and 1, 0.95 by default. Smaller values will result is more one-off -#' splits. -#' @param reuseFeatures Logical. Whether or not a feature can be used more than -#' once on the same cluster. Default is TRUE. -#' @param altSplit Logical. Whether or not to force a marker for clusters that -#' are solely defined by the absence of markers. Defulault is TRUE -#' @param consecutiveOneoff Logical. Whether or not to allow one-off splits at -#' consecutive brances. Default it TRUE -#' @return A named list with five elements. -#' \itemize{ -#' \item rules - A named list with one `data.frame` for every label. Each -#' `data.frame` has five columns and gives the set of rules for disinguishing -#' each label. -#' \itemize{ -#' \item feature - Feature identifier. -#' \item direction - Relationship to feature value, -1 if less than, 1 if -#' greater than. -#' \item value - The feature value which defines the decision boundary -#' \item stat - The performance value returned by the splitting metric for -#' this split. -#' \item statUsed - Which performance metric was used. "IG" if information -#' gain and "OO" if one-off. -#' \item level - The level of the tree at which is rule was defined. 1 is the -#' level of the first split of the tree. -#' } -#' \item dendro - A dendrogram object of the decision tree output -#' \item summaryMatrix - A K(labels) by L(features) matrix representation of -#' the decision rules. Columns denote features and rows denote labels. Non-0 -#' values denote instances where a feature was used on a given label. Positive -#' and negative values denote whether the values of the label for that feature -#' were greater than or less than the decision threshold, respectively. The -#' magnitude of Non-0 values denote the level at which the feature was used, -#' where the first split has a magnitude of 1. Note, if reuse_features = TRUE, -#' only the final usage of a feature for a given label is shown. -#' \item prediction - A character vector of label of predictions of the -#' training data using the final model. "MISSING" if label prediction was -#' ambiguous. -#' \item performance - A named list denoting the training performance of the -#' model. -#' \itemize{ -#' \item accuracy - (number correct/number of samples) for the whole set of -#' samples. -#' \item balAcc - mean sensitivity across all labels -#' \item meanPrecision - mean precision across all labels -#' \item correct - the number of correct predictions of each label -#' \item sizes - the number of actual counts of each label -#' \item sensitivity - the sensitivity of the prediciton of each label. -#' \item precision - the precision of the prediciton of each label. -#' } -#' } -#' @examples -#' library(M3DExampleData) -#' counts <- M3DExampleData::Mmus_example_list$data -#' # Subset 500 genes for fast clustering -#' counts <- as.matrix(counts[1501:2000, ]) -#' # Cluster genes ans samples each into 10 modules -#' cm <- celda_CG(counts = counts, L = 10, K = 5, verbose = FALSE) -#' # Get features matrix and cluster assignments -#' factorized <- factorizeMatrix(counts, cm) -#' features <- factorized$proportions$cell -#' class <- clusters(cm)$z -#' # Generate Decision Tree -#' DecTree <- buildTreeHybrid(features, -#' class, -#' oneoffMetric = "modified F1", -#' threshold = 1, -#' consecutiveOneoff = FALSE) -#' -#' # Plot dendrogram -#' plotDendro(DecTree) -#' @export -buildTreeHybrid <- function(features, - class, - oneoffMetric = c("modified F1", "pairwise AUC"), - threshold = 0.95, - reuseFeatures = FALSE, - altSplit = TRUE, - consecutiveOneoff = TRUE) { - - if (ncol(features) != length(class)) { - stop("Number of columns of features must equal length of class") - } - if (any(is.na(class))) { - stop("NA class values") - } - if (any(is.na(features))) { - stop("NA feature values") - } - - # Match the oneoffMetric argument - oneoffMetric <- match.arg(oneoffMetric) - - # Transpose features - features <- t(features) - - # Set class to factor - class <- as.factor(class) - - # Generate list of tree levels - tree <- .generateTreeList( - features, - class, - oneoffMetric, - threshold, - reuseFeatures, - consecutiveOneoff) - - # Add alternative node for the solely down-regulated leaf - if (altSplit) { - tree <- .addAlternativeSplit(tree, features, class) - } - - # Format tree output for plotting and generate summary statistics - DTsummary <- .summarizeTree(tree, features, class) - - return(DTsummary) -} diff --git a/R/findMarkers.R b/R/findMarkers.R new file mode 100644 index 00000000..6ea35aaa --- /dev/null +++ b/R/findMarkers.R @@ -0,0 +1,278 @@ +#' @title Generate decision tree from single-cell clustering output. +#' @description Uses decision tree procudure to generate a set of rules for each +#' cell cluster defined by a single-cell clustering. Splits are determined by +#' one of two metrics at each split: a one-off metric to determine rules for +#' identifying clusters by a single feature, and a balanced metric to determine +#' rules for identifying sets of similar clusters. +#' @param features A L(features) by N(samples) numeric matrix. +#' @param class A vector of K label assignemnts. +#' @param cellTypes List where each element is a cell type and all the clusters within that cell type (i.e. subtypes). +#' @param oneoffMetric A character string. What one-off metric to run, either `modified F1` or `pairwise AUC`. +#' @param threshold A numeric value. The threshold for the oneoff metric to use between 0 and 1, 0.95 by default. Smaller values will result is more one-off splits. +#' @param reuseFeatures Logical. Whether or not a feature can be used more than once on the same cluster. Default is TRUE. +#' @param altSplit Logical. Whether or not to force a marker for clusters that are solely defined by the absence of markers. Defsult is TRUE +#' @param consecutiveOneoff Logical. Whether or not to allow one-off splits at consecutive brances. Default is TRUE +#' @return A named list with five elements. +#' \itemize{ +#' \item rules - A named list with one `data.frame` for every label. Each +#' `data.frame` has five columns and gives the set of rules for disinguishing each label. +#' \itemize{ +#' \item feature - Feature identifier. +#' \item direction - Relationship to feature value, -1 if less than, 1 if greater than. +#' \item value - The feature value which defines the decision boundary +#' \item stat - The performance value returned by the splitting metric for this split. +#' \item statUsed - Which performance metric was used. "IG" if information gain and "OO" if one-off. +#' \item level - The level of the tree at which is rule was defined. 1 is the level of the first split of the tree. +#' } +#' \item dendro - A dendrogram object of the decision tree output +#' \item prediction - A character vector of label of predictions of the training data using the final model. "MISSING" if label prediction was ambiguous. +#' \item performance - A named list denoting the training performance of the model. +#' \itemize{ +#' \item accuracy - (number correct/number of samples) for the whole set of +#' samples. +#' \item balAcc - mean sensitivity across all labels +#' \item meanPrecision - mean precision across all labels +#' \item correct - the number of correct predictions of each label +#' \item sizes - the number of actual counts of each label +#' \item sensitivity - the sensitivity of the prediciton of each label. +#' \item precision - the precision of the prediciton of each label. +#' } +#' } +#' @examples +#' library(M3DExampleData) +#' counts <- M3DExampleData::Mmus_example_list$data +#' # Subset 500 genes for fast clustering +#' counts <- as.matrix(counts[1501:2000, ]) +#' # Cluster genes ans samples each into 10 modules +#' cm <- celda_CG(counts = counts, L = 10, K = 5, verbose = FALSE) +#' # Get features matrix and cluster assignments +#' factorized <- factorizeMatrix(counts, cm) +#' features <- factorized$proportions$cell +#' class <- clusters(cm)$z +#' # Generate Decision Tree +#' decTree <- findMarkers(features, +#' class, +#' oneoffMetric = "modified F1", +#' threshold = 1, +#' consecutiveOneoff = FALSE) +#' +#' # Plot dendrogram +#' plotDendro(decTree) +#' @import magrittr +#' @export +findMarkers <- function(features, + class, + cellTypes, + oneoffMetric = c("modified F1", "pairwise AUC"), + threshold = 0.95, + reuseFeatures = FALSE, + altSplit = TRUE, + consecutiveOneoff = TRUE) { + + if (ncol(features) != length(class)) + stop("Number of columns of features must equal length of class") + + if (any(is.na(class))) + stop("NA class values") + + if (any(is.na(features))) + stop("NA feature values") + + # Match the oneoffMetric argument + oneoffMetric <- match.arg(oneoffMetric) + + # Transpose features + features <- t(features) + + #If no detailed cell types are provided + if(!hasArg(cellTypes)){ + + print('Building tree...') + + # Set class to factor + class <- as.factor(class) + + # Generate list of tree levels + tree <- .generateTreeList( + features, + class, + oneoffMetric, + threshold, + reuseFeatures, + consecutiveOneoff) + + # Add alternative node for the solely down-regulated leaf + if (altSplit) + tree <- .addAlternativeSplit(tree, features, class) + + print('Computing performance metrics...') + + # Format tree output for plotting and generate summary statistics + DTsummary <- .summarizeTree(tree, features, class) + + return(DTsummary) + } + + # If detailed cell types are provided + else{ + + # Check that cell types match class labels + if(mean(unlist(cellTypes) %in% unique(class)) != 1) + stop("Provided cell types and class labels do not match. Please check the 'cellTypes' argument.") + + # Create vector with cell type class labels + newLabels <- class + for (i in names(cellTypes)) { + newLabels[newLabels %in% cellTypes[[i]]] <- i + } + + # Update cell subtype labels + subtypeLabels <- newLabels + subtypeLabels[subtypeLabels %in% names(cellTypes)] <- paste0( + subtypeLabels[subtypeLabels %in% names(cellTypes)], + "(", + class[subtypeLabels %in% names(cellTypes)], + ")" + ) + + ## Create tree for cell types ## + print('Building tree for all cell types...') + tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, threshold, reuseFeatures, consecutiveOneoff) + tree <- list( + rules = .mapClass2features(tree, features, as.factor(newLabels))$rules, + dendro = .convertToDendrogram(tree, as.factor(newLabels)) + ) + + #Store tree's dendrogram in a separate variable + dendro <- tree$dendro + + #Find which cell types have more than one cluster + largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) + + #Create separate trees for each cell type with more than one cluster + newTrees <- lapply(unique(newLabels), function(cellType){ + + #Create tree for this cell type only + if (cellType %in% largeCellTypes) { + + # Print current status + print(paste('Building tree for cell type:', cellType)) + + # Remove used features + featUse <- colnames(features) + if (!reuseFeatures) { + featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] + } + + # Create new Tree + newTree <- .generateTreeList(features[newLabels == cellType, featUse], as.factor(subtypeLabels[newLabels == cellType]), oneoffMetric, threshold, reuseFeatures, consecutiveOneoff) + newTree <- list( + rules = .mapClass2features(newTree, features[newLabels == cellType,], as.factor(subtypeLabels[newLabels == cellType]))$rules, + dendro = .convertToDendrogram(newTree, as.factor(subtypeLabels[newLabels == cellType])) + ) + + #Adjust 'rules' table for new tree + newTree$rules <- lapply(newTree$rules, function(rules){ + rules$level <- rules$level + max(tree$rules[[cellType]]$level) + rules <- rbind(tree$rules[[cellType]], rules) + }) + + return(newTree) + } + else{ + + # Adjust name of class if it contains only one cluster + names(tree$rules)[which(names(tree$rules) == cellType)] <<- paste0(cellType,'(', unlist(cellTypes[cellType]) ,')') + return() + } + }) + #Remove empty trees for single-cluster cell types + newTrees <- newTrees[lengths(newTrees) > 0] + names(newTrees) <- unique(newLabels)[unique(newLabels) %in% largeCellTypes] + + #Find indices of cell type nodes in tree + indices <- lapply(unique(newLabels)[unique(newLabels) %in% largeCellTypes], function(cellType) { + # Initialize sub trees, indices string, and flag + dendSub <- dendro + index <- "" + flag <- TRUE + + while (flag) { + # Get the edge with the class of interest + whEdge <- which(unlist(lapply(dendSub, function(edge) cellType %in% attributes(edge)$classLabels))) + + # Add this as a string + index <- paste0(index, "[[", whEdge, "]]") + + # Move to this branch + dendSub <- eval(parse(text = paste0("dendro", index))) + + # Is this the only class in that branch + flag <- length(attributes(dendSub)$classLabels) > 1 + } + + return(index) + }) + names(indices) <- unique(newLabels)[unique(newLabels) %in% largeCellTypes] + + #Add each cell type tree + for(cellType in names(newTrees)){ + + #Get current tree + cellTypeDendro <- newTrees[[cellType]]$dendro + + #Nudge nodes upward to make room for new tree + dendro <- dendrapply(dendro, function(node) { + if(attributes(node)$height > 1) + attributes(node)$height <- attributes(node)$height + attributes(cellTypeDendro)$height - 1 + return(node) + }) + + #Adjust labels, member count, and midpoint of nodes + dendro <- dendrapply(dendro, function(node){ + #Check if in right branch + if(cellType %in% as.character(attributes(node)$classLabels)){ + #assign new labels replacing old one (i.e. replace cell type label with subtype labels) + attributes(node)$classLabels <- as.character(attributes(node)$classLabels) %>% + .[. != cellType] %>% + c(., unique(subtypeLabels)[grep(cellType, unique(subtypeLabels))]) + + #assign new member count for this branch + attributes(node)$members <- length(attributes(node)$classLabels) + + #assign new midpoint for this branch + attributes(node)$midpoint <- (attributes(node)$members - 1) / 2 + } + return(node) + }) + + #Replace label at new tree's branch point + branchPointLabel <- attributes(eval(parse(text = paste0("dendro", indices[[cellType]]))))$label + branchPointStatUsed <- attributes(eval(parse(text = paste0("dendro", indices[[cellType]]))))$statUsed + if(!is.null(branchPointLabel)) { + attributes(cellTypeDendro)$label <- branchPointLabel + attributes(cellTypeDendro)$statUsed <- branchPointStatUsed + } + + #Add new tree to original tree + eval(parse(text = paste0("dendro", indices[[cellType]], " <- cellTypeDendro"))) + + #Append new tree's 'rules' tables to original tree + tree$rules <- append(tree$rules, newTrees[[cellType]]$rules) + + #Remove old tree's rules + tree$rules <- tree$rules[-which(names(tree$rules) == cellType)] + } + + #Set final tree dendro + tree$dendro <- dendro + + #Get performance metrics + print('Computing performance metrics...') + perfList <- .getPerformance(tree$rules, features, as.factor(subtypeLabels)) + tree$prediction <- perfList$prediction + tree$performance <- perfList$performance + + return(tree) + } +} diff --git a/R/getDecisions.R b/R/getDecisions.R index e5611cc6..118be596 100644 --- a/R/getDecisions.R +++ b/R/getDecisions.R @@ -1,8 +1,8 @@ #' @title Gets cluster estimates using rules generated by -#' `celda::buildTreeHybrid` +#' `celda::findMarkers` #' @description Get decisions for a matrix of features. Estimate cell #' cluster membership using feature matrix input. -#' @param rules List object. The `rules` element from `buildTreeHybrid` +#' @param rules List object. The `rules` element from `findMarkers` #' output. Returns NA if cluster estimation was ambiguous. #' @param features A L(features) by N(samples) numeric matrix. #' @return A character vector of label predicitions. @@ -18,7 +18,7 @@ #' features <- factorized$proportions$cell #' class <- clusters(cm)$z #' # Generate Decision Tree -#' DecTree <- buildTreeHybrid(features, +#' DecTree <- findMarkers(features, #' class, #' oneoffMetric = "modified F1", #' threshold = 1, diff --git a/R/plotDendro.R b/R/plotDendro.R index f189c59e..be4a85fa 100644 --- a/R/plotDendro.R +++ b/R/plotDendro.R @@ -1,11 +1,11 @@ -#' @title Plots dendrogram of `buildTreeHybrid` output +#' @title Plots dendrogram of `findMarkers` output #' @description Generates a dendrogram of the rules and performance -#' (optional) of the decision tree generates by `buildTreeHybrid`. -#' @param decisionTree List object. The output of `celda::buildTreeHybrid`. +#' (optional) of the decision tree generates by `findMarkers`. +#' @param decisionTree List object. The output of `celda::findMarkers`. #' @param classLabel A character value. The name of a label to which draw #' the path and rules. If NULL (default), the rules for every cluster is shown. #' @param addSensPrec Logical. Print training sensitivities and precisions -#' for each cluster below leaf label? Default is TRUE. +#' for each cluster below leaf label? Default is FALSE. #' @param leafSize A numeric value. Size of text below each leaf. Default is 24. #' @param boxSize A numeric value. Size of rule labels. Default is 7. #' @param boxColor A character value. Color of rule labels. Default is `black`. @@ -21,7 +21,7 @@ #' features <- factorized$proportions$cell #' class <- clusters(cm)$z #' # Generate Decision Tree -#' decTree <- buildTreeHybrid(features, +#' decTree <- findMarkers(features, #' class, #' oneoffMetric = "modified F1", #' threshold = 1, @@ -36,7 +36,7 @@ #' @export plotDendro <- function(decisionTree, classLabel = NULL, - addSensPrec = TRUE, + addSensPrec = FALSE, leafSize = 24, boxSize = 7, boxColor = "black") { @@ -82,7 +82,7 @@ plotDendro <- function(decisionTree, # If highlighting a class label, remove non-class specific rules if (!is.null(classLabel)) { - if (!classLabel %in% rownames(decisionTree$summaryMatrix)) { + if (!classLabel %in% names(decisionTree$rules)) { stop("classLabel not a valid class ID.") } dendro <- .highlightClassLabel(dendro, classLabel) @@ -127,6 +127,13 @@ plotDendro <- function(decisionTree, # Add sensitivity and precision measurements if (addSensPrec) { leafLabels <- paste(leafLabels, perfVec[leafLabels], sep = "\n") + leafAngle <- 0 + leafHJust <- 0.5 + leafVJust <- -1 + } else { + leafAngle <- 90 + leafHJust <- 1 + leafVJust <- 0.5 } # Create plot of dendrogram @@ -153,10 +160,12 @@ plotDendro <- function(decisionTree, panel.border = ggplot2::element_blank(), axis.title = ggplot2::element_blank(), axis.ticks = ggplot2::element_blank(), - axis.text.x = ggplot2::element_text(hjust = 0.5, + axis.text.x = ggplot2::element_text( + hjust = leafHJust, + angle = leafAngle, size = leafSize, family = "mono", - vjust = -1), + vjust = leafVJust), axis.text.y = ggplot2::element_blank() )) diff --git a/R/summarizeTreeHelper.R b/R/summarizeTreeHelper.R index c0c0a125..0a12b48b 100644 --- a/R/summarizeTreeHelper.R +++ b/R/summarizeTreeHelper.R @@ -13,7 +13,6 @@ return(list( rules = class2features$rules, dendro = dendro, - summaryMatrix = class2features$c2fMatrix, prediction = perfList$prediction, performance = perfList$performance )) @@ -265,14 +264,9 @@ subUnderscore <- function(x, n) unlist(lapply( )) } -# Create matrix of classes and features combinations +# Create rules of classes and features sequences .mapClass2features <- function(tree, features, class) { - # Create empty matrix - c2fMatrix <- matrix(0, nrow = length(unique(class)), ncol = ncol(features)) - rownames(c2fMatrix) <- sort(unique(class)) - colnames(c2fMatrix) <- colnames(features) - # Get class to feature indices class2featuresIndices <- do.call(rbind, lapply( seq(length(tree)), @@ -320,14 +314,6 @@ subUnderscore <- function(x, n) unlist(lapply( })) rownames(class2featuresIndices) <- NULL - # Add levels to matrix - for (i in seq(nrow(class2featuresIndices))) { - c2fMatrix[class2featuresIndices[i, "class"], - class2featuresIndices[i, "feature"]] <- - class2featuresIndices[i, "level"] * - class2featuresIndices[i, "direction"] - } - # Generate list of rules for each class rules <- lapply(levels(class), function(cl, class2featuresIndices) { @@ -338,6 +324,5 @@ subUnderscore <- function(x, n) unlist(lapply( names(rules) <- levels(class) return(list( - c2fMatrix = c2fMatrix, rules = rules)) } diff --git a/man/buildTreeHybrid.Rd b/man/findMarkers.Rd similarity index 66% rename from man/buildTreeHybrid.Rd rename to man/findMarkers.Rd index 6c9528d2..a01e3d76 100644 --- a/man/buildTreeHybrid.Rd +++ b/man/findMarkers.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildTreeHybrid.R -\name{buildTreeHybrid} -\alias{buildTreeHybrid} +% Please edit documentation in R/findMarkers.R +\name{findMarkers} +\alias{findMarkers} \title{Generate decision tree from single-cell clustering output.} \usage{ -buildTreeHybrid(features, class, oneoffMetric = c("modified F1", +findMarkers(features, class, cellTypes, oneoffMetric = c("modified F1", "pairwise AUC"), threshold = 0.95, reuseFeatures = FALSE, altSplit = TRUE, consecutiveOneoff = TRUE) } @@ -13,54 +13,34 @@ buildTreeHybrid(features, class, oneoffMetric = c("modified F1", \item{class}{A vector of K label assignemnts.} -\item{oneoffMetric}{A character string. What one-off metric to run, either -`modified F1` or `pairwise AUC`.} +\item{cellTypes}{List where each element is a cell type and all the clusters within that cell type (i.e. subtypes).} -\item{threshold}{A numeric value. The threshold for the oneoff metric to use -between 0 and 1, 0.95 by default. Smaller values will result is more one-off -splits.} +\item{oneoffMetric}{A character string. What one-off metric to run, either `modified F1` or `pairwise AUC`.} -\item{reuseFeatures}{Logical. Whether or not a feature can be used more than -once on the same cluster. Default is TRUE.} +\item{threshold}{A numeric value. The threshold for the oneoff metric to use between 0 and 1, 0.95 by default. Smaller values will result is more one-off splits.} -\item{altSplit}{Logical. Whether or not to force a marker for clusters that -are solely defined by the absence of markers. Defulault is TRUE} +\item{reuseFeatures}{Logical. Whether or not a feature can be used more than once on the same cluster. Default is TRUE.} -\item{consecutiveOneoff}{Logical. Whether or not to allow one-off splits at -consecutive brances. Default it TRUE} +\item{altSplit}{Logical. Whether or not to force a marker for clusters that are solely defined by the absence of markers. Defsult is TRUE} + +\item{consecutiveOneoff}{Logical. Whether or not to allow one-off splits at consecutive brances. Default is TRUE} } \value{ A named list with five elements. \itemize{ \item rules - A named list with one `data.frame` for every label. Each -`data.frame` has five columns and gives the set of rules for disinguishing - each label. +`data.frame` has five columns and gives the set of rules for disinguishing each label. \itemize{ \item feature - Feature identifier. - \item direction - Relationship to feature value, -1 if less than, 1 if - greater than. + \item direction - Relationship to feature value, -1 if less than, 1 if greater than. \item value - The feature value which defines the decision boundary - \item stat - The performance value returned by the splitting metric for - this split. - \item statUsed - Which performance metric was used. "IG" if information - gain and "OO" if one-off. - \item level - The level of the tree at which is rule was defined. 1 is the - level of the first split of the tree. + \item stat - The performance value returned by the splitting metric for this split. + \item statUsed - Which performance metric was used. "IG" if information gain and "OO" if one-off. + \item level - The level of the tree at which is rule was defined. 1 is the level of the first split of the tree. } \item dendro - A dendrogram object of the decision tree output - \item summaryMatrix - A K(labels) by L(features) matrix representation of - the decision rules. Columns denote features and rows denote labels. Non-0 - values denote instances where a feature was used on a given label. Positive - and negative values denote whether the values of the label for that feature - were greater than or less than the decision threshold, respectively. The - magnitude of Non-0 values denote the level at which the feature was used, - where the first split has a magnitude of 1. Note, if reuse_features = TRUE, - only the final usage of a feature for a given label is shown. - \item prediction - A character vector of label of predictions of the - training data using the final model. "MISSING" if label prediction was - ambiguous. - \item performance - A named list denoting the training performance of the - model. + \item prediction - A character vector of label of predictions of the training data using the final model. "MISSING" if label prediction was ambiguous. + \item performance - A named list denoting the training performance of the model. \itemize{ \item accuracy - (number correct/number of samples) for the whole set of samples. @@ -92,12 +72,12 @@ factorized <- factorizeMatrix(counts, cm) features <- factorized$proportions$cell class <- clusters(cm)$z # Generate Decision Tree -DecTree <- buildTreeHybrid(features, +decTree <- findMarkers(features, class, oneoffMetric = "modified F1", threshold = 1, consecutiveOneoff = FALSE) # Plot dendrogram -plotDendro(DecTree) +plotDendro(decTree) } diff --git a/man/getDecisions.Rd b/man/getDecisions.Rd index f4289348..b9577534 100644 --- a/man/getDecisions.Rd +++ b/man/getDecisions.Rd @@ -3,12 +3,12 @@ \name{getDecisions} \alias{getDecisions} \title{Gets cluster estimates using rules generated by - `celda::buildTreeHybrid`} + `celda::findMarkers`} \usage{ getDecisions(rules, features) } \arguments{ -\item{rules}{List object. The `rules` element from `buildTreeHybrid` +\item{rules}{List object. The `rules` element from `findMarkers` output. Returns NA if cluster estimation was ambiguous.} \item{features}{A L(features) by N(samples) numeric matrix.} @@ -32,7 +32,7 @@ factorized <- factorizeMatrix(counts, cm) features <- factorized$proportions$cell class <- clusters(cm)$z # Generate Decision Tree -DecTree <- buildTreeHybrid(features, +DecTree <- findMarkers(features, class, oneoffMetric = "modified F1", threshold = 1, diff --git a/man/plotDendro.Rd b/man/plotDendro.Rd index bf01daba..157d38e6 100644 --- a/man/plotDendro.Rd +++ b/man/plotDendro.Rd @@ -2,19 +2,19 @@ % Please edit documentation in R/plotDendro.R \name{plotDendro} \alias{plotDendro} -\title{Plots dendrogram of `buildTreeHybrid` output} +\title{Plots dendrogram of `findMarkers` output} \usage{ -plotDendro(decisionTree, classLabel = NULL, addSensPrec = TRUE, +plotDendro(decisionTree, classLabel = NULL, addSensPrec = FALSE, leafSize = 24, boxSize = 7, boxColor = "black") } \arguments{ -\item{decisionTree}{List object. The output of `celda::buildTreeHybrid`.} +\item{decisionTree}{List object. The output of `celda::findMarkers`.} \item{classLabel}{A character value. The name of a label to which draw the path and rules. If NULL (default), the rules for every cluster is shown.} \item{addSensPrec}{Logical. Print training sensitivities and precisions -for each cluster below leaf label? Default is TRUE.} +for each cluster below leaf label? Default is FALSE.} \item{leafSize}{A numeric value. Size of text below each leaf. Default is 24.} @@ -27,7 +27,7 @@ A ggplot2 object } \description{ Generates a dendrogram of the rules and performance -(optional) of the decision tree generates by `buildTreeHybrid`. +(optional) of the decision tree generates by `findMarkers`. } \examples{ library(M3DExampleData) @@ -41,7 +41,7 @@ factorized <- factorizeMatrix(counts, cm) features <- factorized$proportions$cell class <- clusters(cm)$z # Generate Decision Tree -decTree <- buildTreeHybrid(features, +decTree <- findMarkers(features, class, oneoffMetric = "modified F1", threshold = 1, diff --git a/vignettes/buildTreeHybrid-analysis.Rmd b/vignettes/FindMarkers-analysis.Rmd similarity index 81% rename from vignettes/buildTreeHybrid-analysis.Rmd rename to vignettes/FindMarkers-analysis.Rmd index 514217d9..d47860b8 100644 --- a/vignettes/buildTreeHybrid-analysis.Rmd +++ b/vignettes/FindMarkers-analysis.Rmd @@ -63,7 +63,7 @@ cm <- celda_CG(counts = counts, L = 10, K = 10, verbose = FALSE) # Format celda output for decision tree generation -The decision trees are generated and evaluated by `buildTreeHybrid`. `buildTreeHybrid` requires two arguments: **features** and **class**. **features** is a numeric matrix with a row for every variable (or feature) to be used in sorting and a column for every sample. In this example we will use a factorized matrix of the gene clusters as our feature matrix (Check `?factorizedMatrix` for more details). **class** is a vector with a label assignment for every sample. Note, that neither **features** nor **class** may have any missing values. +The decision trees are generated and evaluated by `findMarkers`. `findMarkers` requires two arguments: **features** and **class**. **features** is a numeric matrix with a row for every variable (or feature) to be used in sorting and a column for every sample. In this example we will use a factorized matrix of the gene clusters as our feature matrix (Check `?factorizedMatrix` for more details). **class** is a vector with a label assignment for every sample. Note, that neither **features** nor **class** may have any missing values. ```{r, eval = TRUE, message = FALSE} # Get features matrix and cluster assignments @@ -74,7 +74,7 @@ class <- clusters(cm)$z # Generate celda decision tree -`buildTreeHybrid` allows for different parameter options. The optimal combination of these parameters is generally analysis specific and may require some trial and error. Parameters include: +`findMarkers` allows for different parameter options. The optimal combination of these parameters is generally analysis specific and may require some trial and error. Parameters include: - **oneoffMetric** A selection for one of two possible metrics for evaluating good one-off splits for segregating a single cluster from every other cluster based on up-regulation of that cluster alone. The performance of either is a value between 0 and 1. The two options include: - *modified F1* - Finds the univariate split that maximizes the harmonic mean of the sensitivity and precision of the up-regulated cluster, as well as the minimum cluster-specific sensitivity across all down-regulated clusters. @@ -86,7 +86,7 @@ class <- clusters(cm)$z ```{r, eval = TRUE, message = FALSE} -DecTree <- buildTreeHybrid(features, +DecTree <- findMarkers(features, class, oneoffMetric = "modified F1", threshold = 0.95, @@ -95,9 +95,9 @@ DecTree <- buildTreeHybrid(features, consecutiveOneoff = FALSE) ``` -## `buildTreeHybrid` output +## `findMarkers` output -The `buildTreeHybrid` output is a named list of five elements +The `findMarkers` output is a named list of four elements - **rules** A named list with one `data.frame` for every label. Each `data.frame` has five columns and gives the set of rules for distinguishing @@ -114,17 +114,6 @@ The `buildTreeHybrid` output is a named list of five elements level of the first split of the tree. - **dendro** A dendrogram object of the decision tree output -- **summaryMatrix** A K(labels) by L(features) matrix representation of - the decision rules. Columns denote features and rows denote labels. Non-0 - values denote instances where a feature was used on a given label. Positive - and negative values denote whether the values of the label for that feature - were greater than or less than the decision threshold, respectively. The - magnitude of Non-0 values denote the level at which the feature was used, - where the first split has a magnitude of 1. Note, if reuse_features = TRUE, - only the final usage of a feature for a given label is shown. - \item prediction** A character vector of label of predictions of the - training data using the final model. "MISSING" if label prediction was - ambiguous. - **performance** A named list denoting the training performance of the - *accuracy* - (number correct/number of samples) for the whole set of @@ -165,12 +154,30 @@ plotDendro(DecTree, # Get label estimates from features matrix -`buildTreeHybrid` performs label estimation of each sample in the training set automatically. You can use the set of rules generated by `buildTreeHybrid` to predict the labels on an independent feature matrix using `getDecisions`. +`findMarkers` performs label estimation of each sample in the training set automatically. You can use the set of rules generated by `findMarkers` to predict the labels on an independent feature matrix using `getDecisions`. ```{r, eval = TRUE, message = FALSE} head(getDecisions(DecTree$rules, features)) ``` +# Create decision tree with meta clusters + +If you have a priori understanding of sub-groups of your cluster labels, you can ensure that these sub-groups are not separated up-stream in the tree by using the optional *cellTypes* argument. This is just a named list of labels in your *class* vector. For example, if we knew that clusters, 4, 5, and 1 were of the same subtype of clusters, we could do the following... + +```{r, eval = TRUE, message = FALSE} +# Run with a hierarchichal split +cellTypes <- list(metaLabel = c("4", "5", "1")) +DecTreeMeta <- findMarkers(features, + class, + cellTypes, + oneoffMetric = "modified F1", + threshold = 1, + reuseFeatures = F, + consecutiveOneoff = FALSE) +plotDendro(DecTreeMeta) +``` + + # Session Information ```{r} From ccd30bdbbdb41b06f5eb2d794421647bc252f68d Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Wed, 22 May 2019 22:36:34 -0400 Subject: [PATCH 010/149] Update R-3.5 version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 70522e99..795fbc87 100755 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ library(devtools) install_github("campbio/celda") ``` -For `R-3.5` users, please install from the `R_3_5` branch. This version of **celda** is identical to the most recent release of **celda** (`master` branch) except it also works on `R-3.5`. **NOTE:** This branch is no longer updated. Please use `R-3.6` versions. +For `R-3.5` users, please install from the `R_3_5` branch. This version of **celda** is identical to the most recent release of **celda** (`master` branch) except it also works on `R-3.5`. ``` library(devtools) install_github("campbio/celda@R_3_5") From 63429aa421b81727896b851f31ccf5f6c92bde61 Mon Sep 17 00:00:00 2001 From: Zhe Wang Date: Thu, 23 May 2019 12:58:18 -0400 Subject: [PATCH 011/149] Update README.md --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 795fbc87..9bed5990 100755 --- a/README.md +++ b/README.md @@ -55,4 +55,9 @@ The vignette in HTML format showing how to use **celda** is available on Biocond Example vignette of doing single-cell RNA-seq data decontamination using DecontX is available [here](http://bioconductor.org/packages/release/bioc/vignettes/celda/inst/doc/DecontX-analysis.html). ## For developers -Check out our [Wiki](https://github.com/campbio/celda/wiki) for [coding style guide](https://github.com/campbio/celda/wiki/Celda-Development-Coding-Style-Guide) if you want to contribute! +Check out our [Wiki](https://github.com/campbio/celda/wiki) for developer's guide if you want to contribute! +- [Celda Development Coding Style Guide](https://github.com/campbio/celda/wiki/Celda-Development-Coding-Style-Guide) +- [Celda Development Robust and Efficient Code](https://github.com/campbio/celda/wiki/Celda-Development-Robust-and-Efficient-Code) +- [Celda Development Rstudio configuration](https://github.com/campbio/celda/wiki/Celda-Development-Rstudio-configuration) +- [FAQ on how to use celda](https://github.com/campbio/celda/wiki/FAQ-on-how-to-use-celda) +- [FAQ on package development](https://github.com/campbio/celda/wiki/FAQ-on-package-development) From bd472c2360349c2696bb23d63bfff800bf401d97 Mon Sep 17 00:00:00 2001 From: ericreed Date: Fri, 24 May 2019 15:13:31 -0400 Subject: [PATCH 012/149] Style edits --- R/findMarkers.R | 192 +++++++++++++++++++++++++++++------------------- 1 file changed, 117 insertions(+), 75 deletions(-) diff --git a/R/findMarkers.R b/R/findMarkers.R index 6ea35aaa..b75a325a 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -6,27 +6,50 @@ #' rules for identifying sets of similar clusters. #' @param features A L(features) by N(samples) numeric matrix. #' @param class A vector of K label assignemnts. -#' @param cellTypes List where each element is a cell type and all the clusters within that cell type (i.e. subtypes). -#' @param oneoffMetric A character string. What one-off metric to run, either `modified F1` or `pairwise AUC`. -#' @param threshold A numeric value. The threshold for the oneoff metric to use between 0 and 1, 0.95 by default. Smaller values will result is more one-off splits. -#' @param reuseFeatures Logical. Whether or not a feature can be used more than once on the same cluster. Default is TRUE. -#' @param altSplit Logical. Whether or not to force a marker for clusters that are solely defined by the absence of markers. Defsult is TRUE -#' @param consecutiveOneoff Logical. Whether or not to allow one-off splits at consecutive brances. Default is TRUE +#' @param oneoffMetric A character string. What one-off metric to run, either +#' `modified F1` or `pairwise AUC`. +#' @param cellTypes List where each element is a cell type and all the clusters +#' within that cell type (i.e. subtypes). +#' @param threshold A numeric value. The threshold for the oneoff metric to use +#' between 0 and 1, 0.95 by default. Smaller values will result is more one-off +#' splits. +#' @param reuseFeatures Logical. Whether or not a feature can be used more than +#' once on the same cluster. Default is TRUE. +#' @param altSplit Logical. Whether or not to force a marker for clusters that +#' are solely defined by the absence of markers. Defulault is TRUE +#' @param consecutiveOneoff Logical. Whether or not to allow one-off splits at +#' consecutive brances. Default it TRUE #' @return A named list with five elements. #' \itemize{ #' \item rules - A named list with one `data.frame` for every label. Each -#' `data.frame` has five columns and gives the set of rules for disinguishing each label. +#' `data.frame` has five columns and gives the set of rules for disinguishing +#' each label. #' \itemize{ #' \item feature - Feature identifier. -#' \item direction - Relationship to feature value, -1 if less than, 1 if greater than. +#' \item direction - Relationship to feature value, -1 if less than, 1 if +#' greater than. #' \item value - The feature value which defines the decision boundary -#' \item stat - The performance value returned by the splitting metric for this split. -#' \item statUsed - Which performance metric was used. "IG" if information gain and "OO" if one-off. -#' \item level - The level of the tree at which is rule was defined. 1 is the level of the first split of the tree. +#' \item stat - The performance value returned by the splitting metric for +#' this split. +#' \item statUsed - Which performance metric was used. "IG" if information +#' gain and "OO" if one-off. +#' \item level - The level of the tree at which is rule was defined. 1 is the +#' level of the first split of the tree. #' } #' \item dendro - A dendrogram object of the decision tree output -#' \item prediction - A character vector of label of predictions of the training data using the final model. "MISSING" if label prediction was ambiguous. -#' \item performance - A named list denoting the training performance of the model. +#' \item summaryMatrix - A K(labels) by L(features) matrix representation of +#' the decision rules. Columns denote features and rows denote labels. Non-0 +#' values denote instances where a feature was used on a given label. Positive +#' and negative values denote whether the values of the label for that feature +#' were greater than or less than the decision threshold, respectively. The +#' magnitude of Non-0 values denote the level at which the feature was used, +#' where the first split has a magnitude of 1. Note, if reuse_features = TRUE, +#' only the final usage of a feature for a given label is shown. +#' \item prediction - A character vector of label of predictions of the +#' training data using the final model. "MISSING" if label prediction was +#' ambiguous. +#' \item performance - A named list denoting the training performance of the +#' model. #' \itemize{ #' \item accuracy - (number correct/number of samples) for the whole set of #' samples. @@ -41,23 +64,23 @@ #' @examples #' library(M3DExampleData) #' counts <- M3DExampleData::Mmus_example_list$data -#' # Subset 500 genes for fast clustering -#' counts <- as.matrix(counts[1501:2000, ]) -#' # Cluster genes ans samples each into 10 modules +#' # subset 100 genes for fast clustering +#' counts <- as.matrix(counts[1500:2000, ]) +#' # cluster genes into 10 modules for quick demo #' cm <- celda_CG(counts = counts, L = 10, K = 5, verbose = FALSE) #' # Get features matrix and cluster assignments #' factorized <- factorizeMatrix(counts, cm) #' features <- factorized$proportions$cell #' class <- clusters(cm)$z #' # Generate Decision Tree -#' decTree <- findMarkers(features, -#' class, -#' oneoffMetric = "modified F1", -#' threshold = 1, -#' consecutiveOneoff = FALSE) +#' DecTree <- buildTreeHybrid(features, +#' class, +#' oneoffMetric = "modified F1", +#' threshold = 1, +#' consecutiveOneoff = FALSE) #' #' # Plot dendrogram -#' plotDendro(decTree) +#' plotDendro(DecTree) #' @import magrittr #' @export findMarkers <- function(features, @@ -69,14 +92,17 @@ findMarkers <- function(features, altSplit = TRUE, consecutiveOneoff = TRUE) { - if (ncol(features) != length(class)) + if (ncol(features) != length(class)) { stop("Number of columns of features must equal length of class") + } - if (any(is.na(class))) + if (any(is.na(class))) { stop("NA class values") + } - if (any(is.na(features))) + if (any(is.na(features))){ stop("NA feature values") + } # Match the oneoffMetric argument oneoffMetric <- match.arg(oneoffMetric) @@ -84,7 +110,7 @@ findMarkers <- function(features, # Transpose features features <- t(features) - #If no detailed cell types are provided + # If no detailed cell types are provided if(!hasArg(cellTypes)){ print('Building tree...') @@ -102,8 +128,9 @@ findMarkers <- function(features, consecutiveOneoff) # Add alternative node for the solely down-regulated leaf - if (altSplit) + if (altSplit) { tree <- .addAlternativeSplit(tree, features, class) + } print('Computing performance metrics...') @@ -111,50 +138,51 @@ findMarkers <- function(features, DTsummary <- .summarizeTree(tree, features, class) return(DTsummary) - } - - # If detailed cell types are provided - else{ - + } else { + # If detailed cell types are provided + # Check that cell types match class labels - if(mean(unlist(cellTypes) %in% unique(class)) != 1) - stop("Provided cell types and class labels do not match. Please check the 'cellTypes' argument.") + if(mean(unlist(cellTypes) %in% unique(class)) != 1) { + stop("Provided cell types do not match class labels. + Please check the 'cellTypes' argument.") + } # Create vector with cell type class labels newLabels <- class for (i in names(cellTypes)) { - newLabels[newLabels %in% cellTypes[[i]]] <- i + newLabels[newLabels %in% cellTypes[[i]]] <- i } - + # Update cell subtype labels subtypeLabels <- newLabels subtypeLabels[subtypeLabels %in% names(cellTypes)] <- paste0( - subtypeLabels[subtypeLabels %in% names(cellTypes)], - "(", - class[subtypeLabels %in% names(cellTypes)], - ")" + subtypeLabels[subtypeLabels %in% names(cellTypes)], + "(", + class[subtypeLabels %in% names(cellTypes)], + ")" ) - ## Create tree for cell types ## + # Create tree for cell types print('Building tree for all cell types...') - tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, threshold, reuseFeatures, consecutiveOneoff) + tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, + threshold, reuseFeatures, consecutiveOneoff) tree <- list( rules = .mapClass2features(tree, features, as.factor(newLabels))$rules, dendro = .convertToDendrogram(tree, as.factor(newLabels)) ) - #Store tree's dendrogram in a separate variable + # Store tree's dendrogram in a separate variable dendro <- tree$dendro - #Find which cell types have more than one cluster + # Find which cell types have more than one cluster largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) - #Create separate trees for each cell type with more than one cluster + # Create separate trees for each cell type with more than one cluster newTrees <- lapply(unique(newLabels), function(cellType){ - #Create tree for this cell type only + # Create tree for this cell type only if (cellType %in% largeCellTypes) { - + # Print current status print(paste('Building tree for cell type:', cellType)) @@ -164,14 +192,20 @@ findMarkers <- function(features, featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] } - # Create new Tree - newTree <- .generateTreeList(features[newLabels == cellType, featUse], as.factor(subtypeLabels[newLabels == cellType]), oneoffMetric, threshold, reuseFeatures, consecutiveOneoff) + # Create new tree + newTree <- .generateTreeList(features[newLabels == cellType, featUse], + as.factor(subtypeLabels[newLabels == cellType]), + oneoffMetric, threshold, + reuseFeatures, consecutiveOneoff) newTree <- list( - rules = .mapClass2features(newTree, features[newLabels == cellType,], as.factor(subtypeLabels[newLabels == cellType]))$rules, - dendro = .convertToDendrogram(newTree, as.factor(subtypeLabels[newLabels == cellType])) + rules = .mapClass2features(newTree, + features[newLabels == cellType,], + as.factor(subtypeLabels[newLabels == cellType]))$rules, + dendro = .convertToDendrogram(newTree, + as.factor(subtypeLabels[newLabels == cellType])) ) - #Adjust 'rules' table for new tree + # Adjust 'rules' table for new tree newTree$rules <- lapply(newTree$rules, function(rules){ rules$level <- rules$level + max(tree$rules[[cellType]]$level) rules <- rbind(tree$rules[[cellType]], rules) @@ -180,17 +214,19 @@ findMarkers <- function(features, return(newTree) } else{ - + # Adjust name of class if it contains only one cluster - names(tree$rules)[which(names(tree$rules) == cellType)] <<- paste0(cellType,'(', unlist(cellTypes[cellType]) ,')') + names(tree$rules)[which(names(tree$rules) == cellType)] <<- paste0( + cellType, '(', unlist(cellTypes[cellType]),')') return() } }) - #Remove empty trees for single-cluster cell types + + # Remove empty trees for single-cluster cell types newTrees <- newTrees[lengths(newTrees) > 0] names(newTrees) <- unique(newLabels)[unique(newLabels) %in% largeCellTypes] - #Find indices of cell type nodes in tree + # Find indices of cell type nodes in tree indices <- lapply(unique(newLabels)[unique(newLabels) %in% largeCellTypes], function(cellType) { # Initialize sub trees, indices string, and flag dendSub <- dendro @@ -199,7 +235,8 @@ findMarkers <- function(features, while (flag) { # Get the edge with the class of interest - whEdge <- which(unlist(lapply(dendSub, function(edge) cellType %in% attributes(edge)$classLabels))) + whEdge <- which(unlist(lapply(dendSub, function(edge) + cellType %in% attributes(edge)$classLabels))) # Add this as a string index <- paste0(index, "[[", whEdge, "]]") @@ -215,59 +252,64 @@ findMarkers <- function(features, }) names(indices) <- unique(newLabels)[unique(newLabels) %in% largeCellTypes] - #Add each cell type tree + # Add each cell type tree for(cellType in names(newTrees)){ - #Get current tree + # Get current tree cellTypeDendro <- newTrees[[cellType]]$dendro - #Nudge nodes upward to make room for new tree + # Nudge nodes upward to make room for new tree dendro <- dendrapply(dendro, function(node) { if(attributes(node)$height > 1) - attributes(node)$height <- attributes(node)$height + attributes(cellTypeDendro)$height - 1 + attributes(node)$height <- attributes(node)$height + + attributes(cellTypeDendro)$height - 1 return(node) }) - #Adjust labels, member count, and midpoint of nodes + # Adjust labels, member count, and midpoint of nodes dendro <- dendrapply(dendro, function(node){ - #Check if in right branch + # Check if in right branch if(cellType %in% as.character(attributes(node)$classLabels)){ - #assign new labels replacing old one (i.e. replace cell type label with subtype labels) - attributes(node)$classLabels <- as.character(attributes(node)$classLabels) %>% + # Replace cell type label with subtype labels + attributes(node)$classLabels <- + as.character(attributes(node)$classLabels) %>% .[. != cellType] %>% c(., unique(subtypeLabels)[grep(cellType, unique(subtypeLabels))]) - #assign new member count for this branch + # Assign new member count for this branch attributes(node)$members <- length(attributes(node)$classLabels) - #assign new midpoint for this branch + # Assign new midpoint for this branch attributes(node)$midpoint <- (attributes(node)$members - 1) / 2 } return(node) }) - #Replace label at new tree's branch point - branchPointLabel <- attributes(eval(parse(text = paste0("dendro", indices[[cellType]]))))$label - branchPointStatUsed <- attributes(eval(parse(text = paste0("dendro", indices[[cellType]]))))$statUsed + # Replace label at new tree's branch point + branchPointLabel <- attributes(eval(parse( + text = paste0("dendro", indices[[cellType]]))))$label + branchPointStatUsed <- attributes(eval(parse( + text = paste0("dendro", indices[[cellType]]))))$statUsed + if(!is.null(branchPointLabel)) { attributes(cellTypeDendro)$label <- branchPointLabel attributes(cellTypeDendro)$statUsed <- branchPointStatUsed } - #Add new tree to original tree + # Add new tree to original tree eval(parse(text = paste0("dendro", indices[[cellType]], " <- cellTypeDendro"))) - #Append new tree's 'rules' tables to original tree + # Append new tree's 'rules' tables to original tree tree$rules <- append(tree$rules, newTrees[[cellType]]$rules) - #Remove old tree's rules + # Remove old tree's rules tree$rules <- tree$rules[-which(names(tree$rules) == cellType)] } - #Set final tree dendro + # Set final tree dendro tree$dendro <- dendro - #Get performance metrics + # Get performance metrics print('Computing performance metrics...') perfList <- .getPerformance(tree$rules, features, as.factor(subtypeLabels)) tree$prediction <- perfList$prediction From 2e9a54eae0e04d751e90e8fa171bca25ef4515cd Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 28 May 2019 11:25:21 -0400 Subject: [PATCH 013/149] add alternate headings support for plotDimReduceFeature --- DESCRIPTION | 2 +- R/plot_dr.R | 106 ++++++++++++++++++++++++++---------- inst/NEWS | 4 ++ man/plotDimReduceFeature.Rd | 7 ++- man/plotDimReduceGrid.Rd | 5 +- 5 files changed, 92 insertions(+), 32 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d1e6819e..59b7aa9f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.1.3 +Version: 1.1.4 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), diff --git a/R/plot_dr.R b/R/plot_dr.R index 77990c8e..0e5dcbf3 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -19,6 +19,8 @@ #' The color will be used to signify the highest values on the scale. #' Default 'blue'. #' @param varLabel Character vector. Title for the color legend. +#' @param headers Character vector. If `NULL`, the corresponding rownames are +#' used as labels. Otherwise, these headers are used to label the genes. #' @return The plot as a ggplot object #' @examples #' data(celdaCGSim, celdaCGMod) @@ -45,7 +47,8 @@ plotDimReduceGrid <- function(dim1, colorLow, colorMid, colorHigh, - varLabel) { + varLabel, + headers) { df <- data.frame(dim1, dim2, t(as.data.frame(matrix))) naIx <- is.na(dim1) | is.na(dim2) @@ -54,24 +57,49 @@ plotDimReduceGrid <- function(dim1, m <- reshape2::melt(df, id.vars = c("dim1", "dim2")) colnames(m) <- c(xlab, ylab, "facet", varLabel) - ggplot2::ggplot(m, - ggplot2::aes_string(x = xlab, y = ylab)) + - ggplot2::geom_point(stat = "identity", - size = size, - ggplot2::aes_string(color = varLabel)) + - ggplot2::facet_wrap(~ facet) + - ggplot2::theme_bw() + - ggplot2::scale_colour_gradient2(low = colorLow, - high = colorHigh, - mid = colorMid, - midpoint = (max(m[, 4]) + min(m[, 4])) / 2, - name = gsub("_", " ", varLabel)) + - ggplot2::theme(strip.background = ggplot2::element_blank(), - panel.grid.major = ggplot2::element_blank(), - panel.grid.minor = ggplot2::element_blank(), - panel.spacing = unit(0, "lines"), - panel.background = ggplot2::element_blank(), - axis.line = ggplot2::element_line(colour = "black")) + if (isFALSE(is.null(headers))) { + names(headers) <- levels(m$facet) + headers <- ggplot2::as_labeller(headers) + + g <- ggplot2::ggplot(m, + ggplot2::aes_string(x = xlab, y = ylab)) + + ggplot2::geom_point(stat = "identity", + size = size, + ggplot2::aes_string(color = varLabel)) + + ggplot2::facet_wrap(~ facet, labeller = headers) + + ggplot2::theme_bw() + + ggplot2::scale_colour_gradient2(low = colorLow, + high = colorHigh, + mid = colorMid, + midpoint = (max(m[, 4]) + min(m[, 4])) / 2, + name = gsub("_", " ", varLabel)) + + ggplot2::theme(strip.background = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_blank(), + panel.grid.minor = ggplot2::element_blank(), + panel.spacing = unit(0, "lines"), + panel.background = ggplot2::element_blank(), + axis.line = ggplot2::element_line(colour = "black")) + } else { + g <- ggplot2::ggplot(m, + ggplot2::aes_string(x = xlab, y = ylab)) + + ggplot2::geom_point(stat = "identity", + size = size, + ggplot2::aes_string(color = varLabel)) + + ggplot2::facet_wrap(~ facet) + + ggplot2::theme_bw() + + ggplot2::scale_colour_gradient2(low = colorLow, + high = colorHigh, + mid = colorMid, + midpoint = (max(m[, 4]) + min(m[, 4])) / 2, + name = gsub("_", " ", varLabel)) + + ggplot2::theme(strip.background = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_blank(), + panel.grid.minor = ggplot2::element_blank(), + panel.spacing = unit(0, "lines"), + panel.background = ggplot2::element_blank(), + axis.line = ggplot2::element_line(colour = "black")) + } + return(g) } @@ -87,6 +115,8 @@ plotDimReduceGrid <- function(dim1, #' @param counts Integer matrix. Rows represent features and columns #' represent cells. #' @param features Character vector. Uses these genes for plotting. +#' @param headers Character vector. If `NULL`, the corresponding rownames are +#' used as labels. Otherwise, these headers are used to label the genes. #' @param normalize Logical. Whether to normalize the columns of `counts`. #' Default TRUE. #' @param exactMatch Logical. Whether an exact match or a partial match using @@ -121,6 +151,7 @@ plotDimReduceFeature <- function(dim1, dim2, counts, features, + headers = NULL, normalize = TRUE, exactMatch = TRUE, trim = c(-2, 2), @@ -130,6 +161,16 @@ plotDimReduceFeature <- function(dim1, colorLow = "grey", colorMid = NULL, colorHigh = "blue") { + + if (isFALSE(is.null(headers))) { + if (length(headers) != length(features)) { + stop("Headers ", + headers, + " should be the same length as features ", + features) + } + } + if (isTRUE(normalize)) { counts <- normalizeCounts(counts, transformationFun = sqrt, @@ -152,15 +193,21 @@ plotDimReduceFeature <- function(dim1, varLabel <- "Expression" if (!isTRUE(exactMatch)) { - featuresIndices <- c() - notFound <- c() - for (gene in features) { - featuresIndices <- - c(featuresIndices, grep(gene, rownames(counts))) - if (length(grep(gene, rownames(counts))) == 0) { - notFound <- c(notFound, gene) + featuresIndices <- integer(length(features)) + notFound <- character(length(features)) + headersFound <- character(length(features)) + for (i in seq_along(features)) { + featuresIndices[i] <- grep(features[i], rownames(counts)) + if (length(grep(features[i], rownames(counts))) == 0) { + notFound[i] <- features[i] + } else { + headersFound[i] <- headers[i] } } + + notFound <- notFound[notFound != ""] + headers <- headersFound[headersFound != ""] + counts <- counts[featuresIndices, , drop = FALSE] if (length(notFound) > 0) { if (length(notFound) == length(features)) { @@ -172,10 +219,11 @@ plotDimReduceFeature <- function(dim1, paste(notFound, sep = "", collapse = ",")) - } + } } else { featuresNotFound <- setdiff(features, intersect(features, rownames(counts))) + whichHeadersNotFound <- which(featuresNotFound == features) if (length(featuresNotFound) > 0) { if (length(featuresNotFound) == length(features)) { stop("None of the provided features had matching", @@ -188,6 +236,7 @@ plotDimReduceFeature <- function(dim1, collapse = ",")) } featuresFound <- setdiff(features, featuresNotFound) + headers <- headers[-whichHeadersNotFound] counts <- counts[featuresFound, , drop = FALSE] } plotDimReduceGrid(dim1, @@ -199,7 +248,8 @@ plotDimReduceFeature <- function(dim1, colorLow, colorMid, colorHigh, - varLabel) + varLabel, + headers) } diff --git a/inst/NEWS b/inst/NEWS index 4527252f..d5795a41 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,7 @@ +Changes in version 1.1.4 (2019-05-28): + + o Add Alternate headings support for plotDimReduceFeature + Changes in version 1.1.3 (2019-05-14): o Add multiclass decision tree (MCDT) cell cluster annotation diff --git a/man/plotDimReduceFeature.Rd b/man/plotDimReduceFeature.Rd index 37fe4f76..e0152836 100644 --- a/man/plotDimReduceFeature.Rd +++ b/man/plotDimReduceFeature.Rd @@ -4,8 +4,8 @@ \alias{plotDimReduceFeature} \title{Plotting feature expression on a dimensionality reduction plot} \usage{ -plotDimReduceFeature(dim1, dim2, counts, features, normalize = TRUE, - exactMatch = TRUE, trim = c(-2, 2), size = 1, +plotDimReduceFeature(dim1, dim2, counts, features, headers = NULL, + normalize = TRUE, exactMatch = TRUE, trim = c(-2, 2), size = 1, xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, colorHigh = "blue") } @@ -21,6 +21,9 @@ represent cells.} \item{features}{Character vector. Uses these genes for plotting.} +\item{headers}{Character vector. If `NULL`, the corresponding rownames are +used as labels. Otherwise, these headers are used to label the genes.} + \item{normalize}{Logical. Whether to normalize the columns of `counts`. Default TRUE.} diff --git a/man/plotDimReduceGrid.Rd b/man/plotDimReduceGrid.Rd index 164544c6..922a22d2 100644 --- a/man/plotDimReduceGrid.Rd +++ b/man/plotDimReduceGrid.Rd @@ -5,7 +5,7 @@ \title{Mapping the dimensionality reduction plot} \usage{ plotDimReduceGrid(dim1, dim2, matrix, size, xlab, ylab, colorLow, colorMid, - colorHigh, varLabel) + colorHigh, varLabel, headers) } \arguments{ \item{dim1}{Numeric vector. First dimension from data dimensionality @@ -35,6 +35,9 @@ The color will be used to signify the highest values on the scale. Default 'blue'.} \item{varLabel}{Character vector. Title for the color legend.} + +\item{headers}{Character vector. If `NULL`, the corresponding rownames are +used as labels. Otherwise, these headers are used to label the genes.} } \value{ The plot as a ggplot object From 285068547ab7b7ac57344133fe4640af9bf2fce0 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 28 May 2019 11:42:56 -0400 Subject: [PATCH 014/149] fix bug --- R/plot_dr.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 0e5dcbf3..8e0f2152 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -223,7 +223,6 @@ plotDimReduceFeature <- function(dim1, } else { featuresNotFound <- setdiff(features, intersect(features, rownames(counts))) - whichHeadersNotFound <- which(featuresNotFound == features) if (length(featuresNotFound) > 0) { if (length(featuresNotFound) == length(features)) { stop("None of the provided features had matching", @@ -234,9 +233,11 @@ plotDimReduceFeature <- function(dim1, paste(featuresNotFound, sep = "", collapse = ",")) + + whichHeadersNotFound <- which(featuresNotFound == features) + headers <- headers[-whichHeadersNotFound] } featuresFound <- setdiff(features, featuresNotFound) - headers <- headers[-whichHeadersNotFound] counts <- counts[featuresFound, , drop = FALSE] } plotDimReduceGrid(dim1, From 3ffaf3d0ca7a623f4b9ed58f513f42701fdb78aa Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 28 May 2019 11:59:32 -0400 Subject: [PATCH 015/149] fix bug --- R/plot_dr.R | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 8e0f2152..8de5a63b 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -195,18 +195,25 @@ plotDimReduceFeature <- function(dim1, if (!isTRUE(exactMatch)) { featuresIndices <- integer(length(features)) notFound <- character(length(features)) - headersFound <- character(length(features)) + if (isFALSE(is.null(headers))) { + headersFound <- character(length(features)) + } + for (i in seq_along(features)) { featuresIndices[i] <- grep(features[i], rownames(counts)) if (length(grep(features[i], rownames(counts))) == 0) { notFound[i] <- features[i] } else { - headersFound[i] <- headers[i] + if (isFALSE(is.null(headers))) { + headersFound[i] <- headers[i] + } } } notFound <- notFound[notFound != ""] - headers <- headersFound[headersFound != ""] + if (isFALSE(is.null(headers))) { + headers <- headersFound[headersFound != ""] + } counts <- counts[featuresIndices, , drop = FALSE] if (length(notFound) > 0) { @@ -233,10 +240,11 @@ plotDimReduceFeature <- function(dim1, paste(featuresNotFound, sep = "", collapse = ",")) - - whichHeadersNotFound <- which(featuresNotFound == features) - headers <- headers[-whichHeadersNotFound] + if (isFALSE(is.null(headers))) { + whichHeadersNotFound <- which(featuresNotFound == features) + headers <- headers[-whichHeadersNotFound] } + } featuresFound <- setdiff(features, featuresNotFound) counts <- counts[featuresFound, , drop = FALSE] } From 9146ad74a37ac44fc67ecdcca6b7ecd1510ba6cf Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 28 May 2019 12:12:59 -0400 Subject: [PATCH 016/149] headers = NULL if exactMatch == FALSE --- R/plot_dr.R | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 8de5a63b..b97e51ee 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -48,7 +48,7 @@ plotDimReduceGrid <- function(dim1, colorMid, colorHigh, varLabel, - headers) { + headers = NULL) { df <- data.frame(dim1, dim2, t(as.data.frame(matrix))) naIx <- is.na(dim1) | is.na(dim2) @@ -169,6 +169,10 @@ plotDimReduceFeature <- function(dim1, " should be the same length as features ", features) } + + if (isFALSE(exactMatch)) { + warning("exactMatch is FALSE. headers will not be used!") + } } if (isTRUE(normalize)) { @@ -193,28 +197,15 @@ plotDimReduceFeature <- function(dim1, varLabel <- "Expression" if (!isTRUE(exactMatch)) { - featuresIndices <- integer(length(features)) - notFound <- character(length(features)) - if (isFALSE(is.null(headers))) { - headersFound <- character(length(features)) - } - - for (i in seq_along(features)) { - featuresIndices[i] <- grep(features[i], rownames(counts)) - if (length(grep(features[i], rownames(counts))) == 0) { - notFound[i] <- features[i] - } else { - if (isFALSE(is.null(headers))) { - headersFound[i] <- headers[i] - } + featuresIndices <- c() + notFound <- c() + for (gene in features) { + featuresIndices <- + c(featuresIndices, grep(gene, rownames(counts))) + if (length(grep(gene, rownames(counts))) == 0) { + notFound <- c(notFound, gene) } } - - notFound <- notFound[notFound != ""] - if (isFALSE(is.null(headers))) { - headers <- headersFound[headersFound != ""] - } - counts <- counts[featuresIndices, , drop = FALSE] if (length(notFound) > 0) { if (length(notFound) == length(features)) { From 2ab51bdf403a4515f00a709a21bfcc3072198c91 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 28 May 2019 13:20:37 -0400 Subject: [PATCH 017/149] headers = NULL --- R/plot_dr.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/plot_dr.R b/R/plot_dr.R index b97e51ee..535b3231 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -172,6 +172,7 @@ plotDimReduceFeature <- function(dim1, if (isFALSE(exactMatch)) { warning("exactMatch is FALSE. headers will not be used!") + headers <- NULL } } From dc093399d2061d791855688a7b423fb95f35508b Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 28 May 2019 14:06:02 -0400 Subject: [PATCH 018/149] update docs --- R/celda_C.R | 2 +- R/celda_CG.R | 2 +- man/celda_C.Rd | 2 +- man/celda_CG.Rd | 2 +- man/plotDimReduceGrid.Rd | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index e2270849..fc889b90 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -46,7 +46,7 @@ #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. #' @return An object of class `celda_C` with the cell population clusters -#' stored in in `z`. +#' stored in `z`. #' @seealso `celda_G()` for feature clustering and `celda_CG()` for simultaneous #' clustering of features and cells. `celdaGridSearch()` can be used to run #' multiple values of K and multiple chains in parallel. diff --git a/R/celda_CG.R b/R/celda_CG.R index b17f153c..2946ba87 100755 --- a/R/celda_CG.R +++ b/R/celda_CG.R @@ -58,7 +58,7 @@ #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. #' @return An object of class `celda_CG` with the cell populations clusters -#' stored in in `z` and feature module clusters stored in `y`. +#' stored in `z` and feature module clusters stored in `y`. #' @seealso `celda_G()` for feature clustering and `celda_C()` for clustering #' cells. `celdaGridSearch()` can be used to run multiple values of K/L and #' multiple chains in parallel. diff --git a/man/celda_C.Rd b/man/celda_C.Rd index f4389270..654389bf 100644 --- a/man/celda_C.Rd +++ b/man/celda_C.Rd @@ -75,7 +75,7 @@ can only be used when `initialize = 'random'`. Default NULL.} } \value{ An object of class `celda_C` with the cell population clusters - stored in in `z`. + stored in `z`. } \description{ Clusters the columns of a count matrix containing single-cell diff --git a/man/celda_CG.Rd b/man/celda_CG.Rd index c12e0ecf..4d36d5cb 100644 --- a/man/celda_CG.Rd +++ b/man/celda_CG.Rd @@ -94,7 +94,7 @@ starting values for each feature will be randomly sampled from 1:L. } \value{ An object of class `celda_CG` with the cell populations clusters - stored in in `z` and feature module clusters stored in `y`. + stored in `z` and feature module clusters stored in `y`. } \description{ Clusters the rows and columns of a count matrix containing diff --git a/man/plotDimReduceGrid.Rd b/man/plotDimReduceGrid.Rd index 922a22d2..1c0b1f3d 100644 --- a/man/plotDimReduceGrid.Rd +++ b/man/plotDimReduceGrid.Rd @@ -5,7 +5,7 @@ \title{Mapping the dimensionality reduction plot} \usage{ plotDimReduceGrid(dim1, dim2, matrix, size, xlab, ylab, colorLow, colorMid, - colorHigh, varLabel, headers) + colorHigh, varLabel, headers = NULL) } \arguments{ \item{dim1}{Numeric vector. First dimension from data dimensionality From 340d49e6d63d67d2c17baaabee87105ff1bab66f Mon Sep 17 00:00:00 2001 From: ericreed Date: Tue, 28 May 2019 16:27:10 -0400 Subject: [PATCH 019/149] Small formatting fix --- R/buildTreeHelper.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/buildTreeHelper.R b/R/buildTreeHelper.R index b6a8de70..f7f99efe 100644 --- a/R/buildTreeHelper.R +++ b/R/buildTreeHelper.R @@ -319,10 +319,10 @@ splitStats <- vapply( colnames(features), function(feat, features, class, splitMetric) { - splitMetric(feat, class, features, rPerf = T) + splitMetric(feat, class, features, rPerf = TRUE) }, features, class, splitMetric, FUN.VALUE = double(1)) names(splitStats) <- colnames(features) - splitStats <- sort(splitStats, decreasing = T) + splitStats <- sort(splitStats, decreasing = TRUE) return(splitStats) } @@ -412,7 +412,7 @@ featValues <- features[, feat] # Get order of values - ord <- order(featValues, decreasing = T) + ord <- order(featValues, decreasing = TRUE) # Get sorted class and values featValuesSort <- featValues[ord] @@ -516,7 +516,7 @@ featValues <- features[, feat] # Get order of values - ord <- order(featValues, decreasing = T) + ord <- order(featValues, decreasing = TRUE) # Get sorted class and values featValuesSort <- featValues[ord] @@ -835,7 +835,7 @@ stat = HM, stringsAsFactors = F)) }, .splitMetricModF1, fSub, cSub, group2only)) - altStats <- altStats[order(altStats$stat, decreasing = T), ] + altStats <- altStats[order(altStats$stat, decreasing = TRUE), ] # Get alternative splits splitStats <- altStats$stat[1] From 8adfd727b9ed7bdbc82e28d5c0a3ca50b3733499 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Mon, 3 Jun 2019 13:49:45 -0400 Subject: [PATCH 020/149] remove redundant clustSplit --- R/split_clusters.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/R/split_clusters.R b/R/split_clusters.R index 676f0d1f..33f02ccc 100644 --- a/R/split_clusters.R +++ b/R/split_clusters.R @@ -31,16 +31,6 @@ } ## Loop through each split-able Z and perform split - clustSplit <- lapply(zToSplit, function(x) { - clusters(.celda_C(counts[, z == x], - K = 2, - zInitialize = "random", - maxIter = 5, - splitOnIter = -1, - splitOnLast = FALSE, - verbose = FALSE))$z - }) - clustSplit <- vector("list", K) for (i in zToSplit) { clustLabel <- .celda_C( From e4ff4adefde7fa6f6c744edba13848ce3f6eb9c4 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Mon, 3 Jun 2019 13:51:17 -0400 Subject: [PATCH 021/149] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 59b7aa9f..9cc6b688 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.1.4 +Version: 1.1.5 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), From bc4a3744436660838d3408e731e1c402deceaf00 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Mon, 3 Jun 2019 13:53:01 -0400 Subject: [PATCH 022/149] remove redundant splitClust assignment --- R/split_clusters.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/R/split_clusters.R b/R/split_clusters.R index 676f0d1f..33f02ccc 100644 --- a/R/split_clusters.R +++ b/R/split_clusters.R @@ -31,16 +31,6 @@ } ## Loop through each split-able Z and perform split - clustSplit <- lapply(zToSplit, function(x) { - clusters(.celda_C(counts[, z == x], - K = 2, - zInitialize = "random", - maxIter = 5, - splitOnIter = -1, - splitOnLast = FALSE, - verbose = FALSE))$z - }) - clustSplit <- vector("list", K) for (i in zToSplit) { clustLabel <- .celda_C( From e2cf9f91a1358a1714cd71c2b5a6da8897fb6c62 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Mon, 3 Jun 2019 13:53:29 -0400 Subject: [PATCH 023/149] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3ef7aa5a..2a7e8aea 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.0.3 +Version: 1.0.4 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), From d7993e4cbc8fe4da29976c97477d822764651673 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Fri, 7 Jun 2019 11:42:09 -0400 Subject: [PATCH 024/149] fix topRank doc --- R/feature_selection.R | 2 +- man/topRank.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/feature_selection.R b/R/feature_selection.R index 1f9cac4b..f4e5a140 100755 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -9,7 +9,7 @@ #' columns. Default 2. #' @param threshold Numeric. Only return ranked rows or columns in the matrix #' that are above this threshold. If NULL, then no threshold will be applied. -#' Default 1. +#' Default 0. #' @param decreasing Logical. Specifies if the rank should be decreasing. #' Default TRUE. #' @return List. The `index` variable provides the top `n` row (feature) indices diff --git a/man/topRank.Rd b/man/topRank.Rd index ff3bbb3b..3b78ca01 100644 --- a/man/topRank.Rd +++ b/man/topRank.Rd @@ -18,7 +18,7 @@ columns. Default 2.} \item{threshold}{Numeric. Only return ranked rows or columns in the matrix that are above this threshold. If NULL, then no threshold will be applied. -Default 1.} +Default 0.} \item{decreasing}{Logical. Specifies if the rank should be decreasing. Default TRUE.} From 17af51f0f0ca671664547e051db42d4a621ce7c9 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Fri, 7 Jun 2019 11:43:00 -0400 Subject: [PATCH 025/149] fix topRank doc --- R/feature_selection.R | 2 +- man/topRank.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/feature_selection.R b/R/feature_selection.R index 1f9cac4b..f4e5a140 100755 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -9,7 +9,7 @@ #' columns. Default 2. #' @param threshold Numeric. Only return ranked rows or columns in the matrix #' that are above this threshold. If NULL, then no threshold will be applied. -#' Default 1. +#' Default 0. #' @param decreasing Logical. Specifies if the rank should be decreasing. #' Default TRUE. #' @return List. The `index` variable provides the top `n` row (feature) indices diff --git a/man/topRank.Rd b/man/topRank.Rd index ff3bbb3b..3b78ca01 100644 --- a/man/topRank.Rd +++ b/man/topRank.Rd @@ -18,7 +18,7 @@ columns. Default 2.} \item{threshold}{Numeric. Only return ranked rows or columns in the matrix that are above this threshold. If NULL, then no threshold will be applied. -Default 1.} +Default 0.} \item{decreasing}{Logical. Specifies if the rank should be decreasing. Default TRUE.} From 0deb8fdb4ddd3b1de89446764f628bb38be46e44 Mon Sep 17 00:00:00 2001 From: ericreed Date: Tue, 11 Jun 2019 11:22:35 -0400 Subject: [PATCH 026/149] Minor formatting changes for style guide --- R/findMarkers.R | 150 +++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 72 deletions(-) diff --git a/R/findMarkers.R b/R/findMarkers.R index b75a325a..70623d82 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -91,33 +91,33 @@ findMarkers <- function(features, reuseFeatures = FALSE, altSplit = TRUE, consecutiveOneoff = TRUE) { - + if (ncol(features) != length(class)) { stop("Number of columns of features must equal length of class") } - + if (any(is.na(class))) { stop("NA class values") } - - if (any(is.na(features))){ + + if (any(is.na(features))){ stop("NA feature values") } - + # Match the oneoffMetric argument oneoffMetric <- match.arg(oneoffMetric) - + # Transpose features features <- t(features) - + # If no detailed cell types are provided if(!hasArg(cellTypes)){ - + print('Building tree...') - + # Set class to factor class <- as.factor(class) - + # Generate list of tree levels tree <- .generateTreeList( features, @@ -126,33 +126,33 @@ findMarkers <- function(features, threshold, reuseFeatures, consecutiveOneoff) - + # Add alternative node for the solely down-regulated leaf if (altSplit) { tree <- .addAlternativeSplit(tree, features, class) } - + print('Computing performance metrics...') - + # Format tree output for plotting and generate summary statistics DTsummary <- .summarizeTree(tree, features, class) - + return(DTsummary) } else { # If detailed cell types are provided - + # Check that cell types match class labels if(mean(unlist(cellTypes) %in% unique(class)) != 1) { stop("Provided cell types do not match class labels. Please check the 'cellTypes' argument.") } - + # Create vector with cell type class labels newLabels <- class for (i in names(cellTypes)) { newLabels[newLabels %in% cellTypes[[i]]] <- i } - + # Update cell subtype labels subtypeLabels <- newLabels subtypeLabels[subtypeLabels %in% names(cellTypes)] <- paste0( @@ -161,7 +161,7 @@ findMarkers <- function(features, class[subtypeLabels %in% names(cellTypes)], ")" ) - + # Create tree for cell types print('Building tree for all cell types...') tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, @@ -170,94 +170,99 @@ findMarkers <- function(features, rules = .mapClass2features(tree, features, as.factor(newLabels))$rules, dendro = .convertToDendrogram(tree, as.factor(newLabels)) ) - + # Store tree's dendrogram in a separate variable dendro <- tree$dendro - + # Find which cell types have more than one cluster largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) - + # Create separate trees for each cell type with more than one cluster newTrees <- lapply(unique(newLabels), function(cellType){ - + # Create tree for this cell type only if (cellType %in% largeCellTypes) { - + # Print current status print(paste('Building tree for cell type:', cellType)) - + # Remove used features featUse <- colnames(features) if (!reuseFeatures) { featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] } - + # Create new tree newTree <- .generateTreeList(features[newLabels == cellType, featUse], - as.factor(subtypeLabels[newLabels == cellType]), + as.factor(subtypeLabels[ + newLabels == cellType]), oneoffMetric, threshold, reuseFeatures, consecutiveOneoff) newTree <- list( rules = .mapClass2features(newTree, features[newLabels == cellType,], - as.factor(subtypeLabels[newLabels == cellType]))$rules, + as.factor(subtypeLabels[ + newLabels == cellType]))$rules, dendro = .convertToDendrogram(newTree, - as.factor(subtypeLabels[newLabels == cellType])) + as.factor(subtypeLabels[ + newLabels == cellType])) ) - + # Adjust 'rules' table for new tree newTree$rules <- lapply(newTree$rules, function(rules){ rules$level <- rules$level + max(tree$rules[[cellType]]$level) rules <- rbind(tree$rules[[cellType]], rules) }) - + return(newTree) } else{ - + # Adjust name of class if it contains only one cluster names(tree$rules)[which(names(tree$rules) == cellType)] <<- paste0( cellType, '(', unlist(cellTypes[cellType]),')') return() } }) - + # Remove empty trees for single-cluster cell types newTrees <- newTrees[lengths(newTrees) > 0] names(newTrees) <- unique(newLabels)[unique(newLabels) %in% largeCellTypes] - + # Find indices of cell type nodes in tree - indices <- lapply(unique(newLabels)[unique(newLabels) %in% largeCellTypes], function(cellType) { - # Initialize sub trees, indices string, and flag - dendSub <- dendro - index <- "" - flag <- TRUE - - while (flag) { - # Get the edge with the class of interest - whEdge <- which(unlist(lapply(dendSub, function(edge) - cellType %in% attributes(edge)$classLabels))) - - # Add this as a string - index <- paste0(index, "[[", whEdge, "]]") - - # Move to this branch - dendSub <- eval(parse(text = paste0("dendro", index))) - - # Is this the only class in that branch - flag <- length(attributes(dendSub)$classLabels) > 1 - } - - return(index) - }) + indices <- lapply(unique(newLabels)[unique(newLabels) %in% largeCellTypes], + function(cellType) { + # Initialize sub trees, indices string, and flag + dendSub <- dendro + index <- "" + flag <- TRUE + + while (flag) { + # Get the edge with the class of interest + whEdge <- which(unlist(lapply(dendSub, function(edge) + cellType %in% attributes(edge)$classLabels))) + + # Add this as a string + index <- paste0(index, "[[", whEdge, "]]") + + # Move to this branch + dendSub <- eval(parse(text = paste0("dendro", index))) + + # Is this the only class in that branch + flag <- length(attributes(dendSub)$classLabels) > 1 + } + + return(index) + } + ) names(indices) <- unique(newLabels)[unique(newLabels) %in% largeCellTypes] - + # Add each cell type tree for(cellType in names(newTrees)){ - + # Get current tree cellTypeDendro <- newTrees[[cellType]]$dendro - + # Nudge nodes upward to make room for new tree dendro <- dendrapply(dendro, function(node) { if(attributes(node)$height > 1) @@ -265,56 +270,57 @@ findMarkers <- function(features, attributes(cellTypeDendro)$height - 1 return(node) }) - + # Adjust labels, member count, and midpoint of nodes dendro <- dendrapply(dendro, function(node){ # Check if in right branch if(cellType %in% as.character(attributes(node)$classLabels)){ # Replace cell type label with subtype labels - attributes(node)$classLabels <- + attributes(node)$classLabels <- as.character(attributes(node)$classLabels) %>% .[. != cellType] %>% c(., unique(subtypeLabels)[grep(cellType, unique(subtypeLabels))]) - + # Assign new member count for this branch attributes(node)$members <- length(attributes(node)$classLabels) - + # Assign new midpoint for this branch attributes(node)$midpoint <- (attributes(node)$members - 1) / 2 } return(node) }) - + # Replace label at new tree's branch point branchPointLabel <- attributes(eval(parse( text = paste0("dendro", indices[[cellType]]))))$label branchPointStatUsed <- attributes(eval(parse( text = paste0("dendro", indices[[cellType]]))))$statUsed - + if(!is.null(branchPointLabel)) { attributes(cellTypeDendro)$label <- branchPointLabel attributes(cellTypeDendro)$statUsed <- branchPointStatUsed } - + # Add new tree to original tree - eval(parse(text = paste0("dendro", indices[[cellType]], " <- cellTypeDendro"))) - + eval(parse(text = paste0( + "dendro", indices[[cellType]], " <- cellTypeDendro"))) + # Append new tree's 'rules' tables to original tree tree$rules <- append(tree$rules, newTrees[[cellType]]$rules) - + # Remove old tree's rules tree$rules <- tree$rules[-which(names(tree$rules) == cellType)] } - + # Set final tree dendro tree$dendro <- dendro - + # Get performance metrics print('Computing performance metrics...') perfList <- .getPerformance(tree$rules, features, as.factor(subtypeLabels)) tree$prediction <- perfList$prediction tree$performance <- perfList$performance - + return(tree) } } From 3830cdbccfe8e4fec2a3df2a75960ae4a300c48b Mon Sep 17 00:00:00 2001 From: ericreed Date: Fri, 14 Jun 2019 15:49:51 -0400 Subject: [PATCH 027/149] Fixed the tree height issues --- R/findMarkers.R | 417 ++++++++++++++++++++++++++---------------------- R/plotDendro.R | 16 +- 2 files changed, 240 insertions(+), 193 deletions(-) diff --git a/R/findMarkers.R b/R/findMarkers.R index 70623d82..677c2fe4 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -92,26 +92,26 @@ findMarkers <- function(features, altSplit = TRUE, consecutiveOneoff = TRUE) { - if (ncol(features) != length(class)) { - stop("Number of columns of features must equal length of class") - } + if (ncol(features) != length(class)) { + stop("Number of columns of features must equal length of class") + } - if (any(is.na(class))) { - stop("NA class values") - } + if (any(is.na(class))) { + stop("NA class values") + } - if (any(is.na(features))){ - stop("NA feature values") - } + if (any(is.na(features))){ + stop("NA feature values") + } - # Match the oneoffMetric argument - oneoffMetric <- match.arg(oneoffMetric) + # Match the oneoffMetric argument + oneoffMetric <- match.arg(oneoffMetric) - # Transpose features - features <- t(features) + # Transpose features + features <- t(features) - # If no detailed cell types are provided - if(!hasArg(cellTypes)){ + # If no detailed cell types are provided + if(!hasArg(cellTypes)){ print('Building tree...') @@ -120,16 +120,16 @@ findMarkers <- function(features, # Generate list of tree levels tree <- .generateTreeList( - features, - class, - oneoffMetric, - threshold, - reuseFeatures, - consecutiveOneoff) + features, + class, + oneoffMetric, + threshold, + reuseFeatures, + consecutiveOneoff) # Add alternative node for the solely down-regulated leaf if (altSplit) { - tree <- .addAlternativeSplit(tree, features, class) + tree <- .addAlternativeSplit(tree, features, class) } print('Computing performance metrics...') @@ -138,189 +138,224 @@ findMarkers <- function(features, DTsummary <- .summarizeTree(tree, features, class) return(DTsummary) - } else { - # If detailed cell types are provided - # Check that cell types match class labels - if(mean(unlist(cellTypes) %in% unique(class)) != 1) { - stop("Provided cell types do not match class labels. - Please check the 'cellTypes' argument.") - } + } else { + # If detailed cell types are provided - # Create vector with cell type class labels - newLabels <- class - for (i in names(cellTypes)) { - newLabels[newLabels %in% cellTypes[[i]]] <- i - } + # Check that cell types match class labels + if(mean(unlist(cellTypes) %in% unique(class)) != 1) { + stop("Provided cell types do not match class labels. + Please check the 'cellTypes' argument.") + } - # Update cell subtype labels - subtypeLabels <- newLabels - subtypeLabels[subtypeLabels %in% names(cellTypes)] <- paste0( - subtypeLabels[subtypeLabels %in% names(cellTypes)], - "(", - class[subtypeLabels %in% names(cellTypes)], - ")" - ) - - # Create tree for cell types - print('Building tree for all cell types...') - tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, - threshold, reuseFeatures, consecutiveOneoff) - tree <- list( - rules = .mapClass2features(tree, features, as.factor(newLabels))$rules, - dendro = .convertToDendrogram(tree, as.factor(newLabels)) - ) - - # Store tree's dendrogram in a separate variable - dendro <- tree$dendro - - # Find which cell types have more than one cluster - largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) - - # Create separate trees for each cell type with more than one cluster - newTrees <- lapply(unique(newLabels), function(cellType){ - - # Create tree for this cell type only - if (cellType %in% largeCellTypes) { - - # Print current status - print(paste('Building tree for cell type:', cellType)) - - # Remove used features - featUse <- colnames(features) - if (!reuseFeatures) { - featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] + # Create vector with cell type class labels + newLabels <- class + for (i in names(cellTypes)) { + newLabels[newLabels %in% cellTypes[[i]]] <- i } - # Create new tree - newTree <- .generateTreeList(features[newLabels == cellType, featUse], - as.factor(subtypeLabels[ - newLabels == cellType]), - oneoffMetric, threshold, - reuseFeatures, consecutiveOneoff) - newTree <- list( - rules = .mapClass2features(newTree, - features[newLabels == cellType,], - as.factor(subtypeLabels[ - newLabels == cellType]))$rules, - dendro = .convertToDendrogram(newTree, - as.factor(subtypeLabels[ - newLabels == cellType])) + # Update cell subtype labels + subtypeLabels <- newLabels + subtypeLabels[subtypeLabels %in% names(cellTypes)] <- paste0( + subtypeLabels[subtypeLabels %in% names(cellTypes)], + "(", + class[subtypeLabels %in% names(cellTypes)], + ")" ) - # Adjust 'rules' table for new tree - newTree$rules <- lapply(newTree$rules, function(rules){ - rules$level <- rules$level + max(tree$rules[[cellType]]$level) - rules <- rbind(tree$rules[[cellType]], rules) - }) + # Create tree for cell types + print('Building tree for all cell types...') + tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, + threshold, reuseFeatures, consecutiveOneoff) + tree <- list( + rules = .mapClass2features(tree, features, as.factor(newLabels))$rules, + dendro = .convertToDendrogram(tree, as.factor(newLabels)) + ) - return(newTree) - } - else{ - - # Adjust name of class if it contains only one cluster - names(tree$rules)[which(names(tree$rules) == cellType)] <<- paste0( - cellType, '(', unlist(cellTypes[cellType]),')') - return() - } - }) - - # Remove empty trees for single-cluster cell types - newTrees <- newTrees[lengths(newTrees) > 0] - names(newTrees) <- unique(newLabels)[unique(newLabels) %in% largeCellTypes] - - # Find indices of cell type nodes in tree - indices <- lapply(unique(newLabels)[unique(newLabels) %in% largeCellTypes], - function(cellType) { - # Initialize sub trees, indices string, and flag - dendSub <- dendro - index <- "" - flag <- TRUE - - while (flag) { - # Get the edge with the class of interest - whEdge <- which(unlist(lapply(dendSub, function(edge) - cellType %in% attributes(edge)$classLabels))) - - # Add this as a string - index <- paste0(index, "[[", whEdge, "]]") - - # Move to this branch - dendSub <- eval(parse(text = paste0("dendro", index))) - - # Is this the only class in that branch - flag <- length(attributes(dendSub)$classLabels) > 1 - } - - return(index) - } - ) - names(indices) <- unique(newLabels)[unique(newLabels) %in% largeCellTypes] - - # Add each cell type tree - for(cellType in names(newTrees)){ - - # Get current tree - cellTypeDendro <- newTrees[[cellType]]$dendro - - # Nudge nodes upward to make room for new tree - dendro <- dendrapply(dendro, function(node) { - if(attributes(node)$height > 1) - attributes(node)$height <- attributes(node)$height + - attributes(cellTypeDendro)$height - 1 - return(node) - }) - - # Adjust labels, member count, and midpoint of nodes - dendro <- dendrapply(dendro, function(node){ - # Check if in right branch - if(cellType %in% as.character(attributes(node)$classLabels)){ - # Replace cell type label with subtype labels - attributes(node)$classLabels <- - as.character(attributes(node)$classLabels) %>% - .[. != cellType] %>% - c(., unique(subtypeLabels)[grep(cellType, unique(subtypeLabels))]) - - # Assign new member count for this branch - attributes(node)$members <- length(attributes(node)$classLabels) - - # Assign new midpoint for this branch - attributes(node)$midpoint <- (attributes(node)$members - 1) / 2 - } - return(node) - }) + # Store tree's dendrogram in a separate variable + dendro <- tree$dendro - # Replace label at new tree's branch point - branchPointLabel <- attributes(eval(parse( - text = paste0("dendro", indices[[cellType]]))))$label - branchPointStatUsed <- attributes(eval(parse( - text = paste0("dendro", indices[[cellType]]))))$statUsed + # Find which cell types have more than one cluster + largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) - if(!is.null(branchPointLabel)) { - attributes(cellTypeDendro)$label <- branchPointLabel - attributes(cellTypeDendro)$statUsed <- branchPointStatUsed - } + # Create separate trees for each cell type with more than one cluster + newTrees <- lapply(unique(newLabels), function(cellType){ - # Add new tree to original tree - eval(parse(text = paste0( - "dendro", indices[[cellType]], " <- cellTypeDendro"))) + # Create tree for this cell type only + if (cellType %in% largeCellTypes) { - # Append new tree's 'rules' tables to original tree - tree$rules <- append(tree$rules, newTrees[[cellType]]$rules) + # Print current status + print(paste('Building tree for cell type:', cellType)) - # Remove old tree's rules - tree$rules <- tree$rules[-which(names(tree$rules) == cellType)] - } + # Remove used features + featUse <- colnames(features) + if (!reuseFeatures) { + featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] + } + + # Create new tree + newTree <- .generateTreeList(features[newLabels == cellType, + featUse], + as.factor(subtypeLabels[ + newLabels == cellType]), + oneoffMetric, threshold, + reuseFeatures, consecutiveOneoff) + newTree <- list( + rules = .mapClass2features(newTree, + features[newLabels == cellType,], + as.factor(subtypeLabels[ + newLabels == cellType]))$rules, + dendro = .convertToDendrogram(newTree, + as.factor(subtypeLabels[ + newLabels == cellType])) + ) + + # Adjust 'rules' table for new tree + newTree$rules <- lapply(newTree$rules, function(rules){ + rules$level <- rules$level + max(tree$rules[[cellType]]$level) + rules <- rbind(tree$rules[[cellType]], rules) + }) + + return(newTree) + } else { + # Adjust name of class if it contains only one cluster + names(tree$rules)[which(names(tree$rules) == cellType)] <<- paste0( + cellType, '(', unlist(cellTypes[cellType]),')') + return() + } + }) - # Set final tree dendro - tree$dendro <- dendro + # Fix max depth in original tree + maxDepth <- max(unlist(lapply(newTrees, function(newTree) { + lapply(newTree$rules, function(ruleDF) { + ruleDF$level + }) + }))) + addDepth <- maxDepth - attributes(dendro)$height + + dendro <- dendrapply(dendro, function(node, addDepth) { + if(attributes(node)$height > 1){ + attributes(node)$height <- + attributes(node)$height + addDepth + 1 + } + return(node) + }, addDepth) + + # Remove empty trees for single-cluster cell types + newTrees <- newTrees[lengths(newTrees) > 0] + names(newTrees) <- unique(newLabels)[ + unique(newLabels) %in% largeCellTypes] + + # Find indices of cell type nodes in tree + indices <- lapply(unique(newLabels)[ + unique(newLabels) %in% largeCellTypes], + function(cellType) { + # Initialize sub trees, indices string, and flag + dendSub <- dendro + index <- "" + flag <- TRUE + + while (flag) { + # Get the edge with the class of interest + whEdge <- which(unlist(lapply(dendSub, function(edge) + cellType %in% attributes(edge)$classLabels))) + + # Add this as a string + index <- paste0(index, "[[", whEdge, "]]") + + # Move to this branch + dendSub <- eval(parse(text = paste0("dendro", index))) + + # Is this the only class in that branch + flag <- length(attributes(dendSub)$classLabels) > 1 + } + + return(index) + }) + names(indices) <- unique(newLabels)[ + unique(newLabels) %in% largeCellTypes] + + # Add each cell type tree + for (cellType in names(newTrees)) { + + # Get current tree + cellTypeDendro <- newTrees[[cellType]]$dendro + + # Adjust labels, member count, and midpoint of nodes + dendro <- dendrapply(dendro, function(node){ + # Check if in right branch + if(cellType %in% as.character(attributes(node)$classLabels)){ + # Replace cell type label with subtype labels + attributes(node)$classLabels <- + as.character(attributes(node)$classLabels) %>% + .[. != cellType] %>% + c(., unique(subtypeLabels)[grep(cellType, + unique(subtypeLabels))]) + + # Assign new member count for this branch + attributes(node)$members <- + length(attributes(node)$classLabels) + + # Assign new midpoint for this branch + attributes(node)$midpoint <- + (attributes(node)$members - 1) / 2 + } + return(node) + }) + + # Replace label at new tree's branch point + branchPointAttr <- attributes(eval(parse( + text = paste0("dendro", indices[[cellType]])))) + branchPointLabel <- branchPointAttr$label + branchPointStatUsed <- branchPointAttr$statUsed + + if(!is.null(branchPointLabel)) { + attributes(cellTypeDendro)$label <- branchPointLabel + attributes(cellTypeDendro)$statUsed <- branchPointStatUsed + } + + # Fix height + indLoc <- gregexpr("\\[\\[", indices[[cellType]])[[1]] + indLoc <- indLoc[length(indLoc)] + parentIndexString <- substr(indices[[cellType]], + 0, + indLoc-1) + parentHeight <- attributes(eval(parse( + text = paste0("dendro", parentIndexString))))$height + cellTypeHeight <- attributes(cellTypeDendro)$height + cellTypeDendro <- dendrapply(cellTypeDendro, + function(node, + parentHeight, + cellTypeHeight) { + if(attributes(node)$height > 1){ + attributes(node)$height <- parentHeight - 1 - + (cellTypeHeight - attributes(node)$height) + } + return(node) + }, parentHeight, cellTypeHeight) + + # Add new tree to original tree + eval(parse(text = paste0( + "dendro", indices[[cellType]], " <- cellTypeDendro"))) + + # Append new tree's 'rules' tables to original tree + tree$rules <- append(tree$rules, newTrees[[cellType]]$rules) + + # Remove old tree's rules + tree$rules <- tree$rules[-which(names(tree$rules) == cellType)] + } - # Get performance metrics - print('Computing performance metrics...') - perfList <- .getPerformance(tree$rules, features, as.factor(subtypeLabels)) - tree$prediction <- perfList$prediction - tree$performance <- perfList$performance + # Set final tree dendro + tree$dendro <- dendro - return(tree) - } + # Get performance metrics + print('Computing performance metrics...') + perfList <- .getPerformance(tree$rules, + features, + as.factor(subtypeLabels)) + tree$prediction <- perfList$prediction + tree$performance <- perfList$performance + + return(tree) + } } diff --git a/R/plotDendro.R b/R/plotDendro.R index be4a85fa..3ea9a975 100644 --- a/R/plotDendro.R +++ b/R/plotDendro.R @@ -6,6 +6,8 @@ #' the path and rules. If NULL (default), the rules for every cluster is shown. #' @param addSensPrec Logical. Print training sensitivities and precisions #' for each cluster below leaf label? Default is FALSE. +#' @param maxFeaturePrint A numeric value. Maximum number of feature IDs to print +#' at a given node. Default is 4. #' @param leafSize A numeric value. Size of text below each leaf. Default is 24. #' @param boxSize A numeric value. Size of rule labels. Default is 7. #' @param boxColor A character value. Color of rule labels. Default is `black`. @@ -37,6 +39,7 @@ plotDendro <- function(decisionTree, classLabel = NULL, addSensPrec = FALSE, + maxFeaturePrint = 4, leafSize = 24, boxSize = 7, boxColor = "black") { @@ -75,9 +78,18 @@ plotDendro <- function(decisionTree, segs <- as.data.frame(dendextend::get_nodes_xy(dendro)) colnames(segs) <- c("xend", "yend") - # As label and which stat was used - # Labels will stack + # Add labels to nodes segs$label <- gsub(";", "\n", dendextend::get_nodes_attr(dendro, "label")) + segs$label <- sapply(segs$label, function(lab, maxFeaturePrint) { + loc <- gregexpr("\n", lab)[[1]][maxFeaturePrint] + if(!is.na(loc)) { + lab <- substr(lab, 1, loc-2) + } + return(lab) + }, maxFeaturePrint) + + # Subset for max + segs$statUsed <- dendextend::get_nodes_attr(dendro, "statUsed") # If highlighting a class label, remove non-class specific rules From 083717a09973bcec87029e488a3af39d6c8f400c Mon Sep 17 00:00:00 2001 From: ericreed Date: Thu, 20 Jun 2019 13:01:31 -0400 Subject: [PATCH 028/149] Fixed bug in setting the max features to print --- R/plotDendro.R | 162 ++++++++++++++++++++++++------------------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/R/plotDendro.R b/R/plotDendro.R index 3ea9a975..28e1e580 100644 --- a/R/plotDendro.R +++ b/R/plotDendro.R @@ -37,34 +37,34 @@ #' @importFrom dendextend get_nodes_xy get_nodes_attr get_leaves_attr #' @export plotDendro <- function(decisionTree, - classLabel = NULL, - addSensPrec = FALSE, - maxFeaturePrint = 4, - leafSize = 24, - boxSize = 7, - boxColor = "black") { - + classLabel = NULL, + addSensPrec = FALSE, + maxFeaturePrint = 4, + leafSize = 24, + boxSize = 7, + boxColor = "black") { + # Get necessary elements dendro <- decisionTree$dendro - + # Get performance information (training or CV based) performance <- decisionTree$performance - + # Create vector of per class performance perfVec <- paste(performance$sizes, - format(round(performance$sensitivity, 2), nsmall = 2), - format(round(performance$precision, 2), nsmall = 2), - sep = "\n" + format(round(performance$sensitivity, 2), nsmall = 2), + format(round(performance$precision, 2), nsmall = 2), + sep = "\n" ) names(perfVec) <- names(performance$sensitivity) - + # Get dendrogram segments dendSegs <- ggdendro::dendro_data(dendro, type = "rectangle")$segments - + # Get necessary coordinates to add labels to # These will have y > 1 dendSegs <- unique(dendSegs[dendSegs$y > 1, c("x", "y", "yend", "xend")]) - + # Labeled splits will be vertical (x != xend) or # Length 0 (x == xend & y == yend) dendSegsAlt <- dendSegs[ @@ -72,26 +72,26 @@ plotDendro <- function(decisionTree, (dendSegs$x == dendSegs$xend & dendSegs$y == dendSegs$yend), c("x", "xend", "y")] colnames(dendSegsAlt)[1] <- "xalt" - + # Label names will be at nodes, these will # Occur at the end of segments segs <- as.data.frame(dendextend::get_nodes_xy(dendro)) colnames(segs) <- c("xend", "yend") - + # Add labels to nodes segs$label <- gsub(";", "\n", dendextend::get_nodes_attr(dendro, "label")) segs$label <- sapply(segs$label, function(lab, maxFeaturePrint) { - loc <- gregexpr("\n", lab)[[1]][maxFeaturePrint] - if(!is.na(loc)) { - lab <- substr(lab, 1, loc-2) - } - return(lab) + loc <- gregexpr("\n", lab)[[1]][maxFeaturePrint] + if(!is.na(loc)) { + lab <- substr(lab, 1, loc-1) + } + return(lab) }, maxFeaturePrint) # Subset for max segs$statUsed <- dendextend::get_nodes_attr(dendro, "statUsed") - + # If highlighting a class label, remove non-class specific rules if (!is.null(classLabel)) { if (!classLabel %in% names(decisionTree$rules)) { @@ -102,40 +102,40 @@ plotDendro <- function(decisionTree, keepLabel[is.na(keepLabel)] <- FALSE segs$label[!keepLabel] <- NA } - + # Remove non-labelled nodes & # leaf nodes (yend == 0) segs <- segs[!is.na(segs$label) & segs$yend != 0, ] - + # Merge to full set of coordinates dendSegsLabelled <- merge(dendSegs, segs) - + # Remove duplicated labels dendSegsLabelled <- dendSegsLabelled[order(dendSegsLabelled$y, - decreasing = T), ] + decreasing = T), ] dendSegsLabelled <- dendSegsLabelled[ !duplicated(dendSegsLabelled[, - c("xend", "x", "yend", "label", "statUsed")]), ] - + c("xend", "x", "yend", "label", "statUsed")]), ] + # Merge with alternative x-coordinates for alternative split dendSegsLabelled <- merge(dendSegsLabelled, dendSegsAlt) - + # Order by height and coordinates dendSegsLabelled <- dendSegsLabelled[order(dendSegsLabelled$x), ] - + # Find information gain splits igSplits <- dendSegsLabelled$statUsed == "IG" & !duplicated(dendSegsLabelled[, c("xalt", "y")]) - + # Set xend for IG splits dendSegsLabelled$xend[igSplits] <- dendSegsLabelled$xalt[igSplits] - + # Set y for non-IG splits dendSegsLabelled$y[!igSplits] <- dendSegsLabelled$y[!igSplits] - 0.2 - + # Get index of leaf labels leafLabels <- dendextend::get_leaves_attr(dendro, "label") - + # Add sensitivity and precision measurements if (addSensPrec) { leafLabels <- paste(leafLabels, perfVec[leafLabels], sep = "\n") @@ -147,61 +147,61 @@ plotDendro <- function(decisionTree, leafHJust <- 1 leafVJust <- 0.5 } - + # Create plot of dendrogram suppressMessages(dendroP <- ggdendro::ggdendrogram(dendro) + - ggplot2::geom_label( - data = dendSegsLabelled, - ggplot2::aes(x = xend, y = y, label = label), - size = boxSize, - label.size = 1, - fontface = "bold", - vjust = 1, - nudge_y = 0.1, - color = boxColor) + - ggplot2::theme_bw() + - ggplot2::scale_x_reverse(breaks = seq(length(leafLabels)), - label = leafLabels) + - ggplot2::scale_y_continuous(expand = c(0, 0)) + - ggplot2::theme( - panel.grid.major.y = ggplot2::element_blank(), - legend.position = "none", - panel.grid.minor.y = ggplot2::element_blank(), - panel.grid.minor.x = ggplot2::element_blank(), - panel.grid.major.x = ggplot2::element_blank(), - panel.border = ggplot2::element_blank(), - axis.title = ggplot2::element_blank(), - axis.ticks = ggplot2::element_blank(), - axis.text.x = ggplot2::element_text( - hjust = leafHJust, - angle = leafAngle, - size = leafSize, - family = "mono", - vjust = leafVJust), - axis.text.y = ggplot2::element_blank() - )) - + ggplot2::geom_label( + data = dendSegsLabelled, + ggplot2::aes(x = xend, y = y, label = label), + size = boxSize, + label.size = 1, + fontface = "bold", + vjust = 1, + nudge_y = 0.1, + color = boxColor) + + ggplot2::theme_bw() + + ggplot2::scale_x_reverse(breaks = seq(length(leafLabels)), + label = leafLabels) + + ggplot2::scale_y_continuous(expand = c(0, 0)) + + ggplot2::theme( + panel.grid.major.y = ggplot2::element_blank(), + legend.position = "none", + panel.grid.minor.y = ggplot2::element_blank(), + panel.grid.minor.x = ggplot2::element_blank(), + panel.grid.major.x = ggplot2::element_blank(), + panel.border = ggplot2::element_blank(), + axis.title = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + axis.text.x = ggplot2::element_text( + hjust = leafHJust, + angle = leafAngle, + size = leafSize, + family = "mono", + vjust = leafVJust), + axis.text.y = ggplot2::element_blank() + )) + # Increase line width slightly for aesthetic purposes dendroP$layers[[2]]$aes_params$size <- 1.3 - + return(dendroP) } # Function to reformat the dendrogram to draw path to a specific class .highlightClassLabel <- function(dendro, classLabel) { - + # Reorder dendrogram flag <- TRUE bIndexString <- "" - + # Get branch branch <- eval(parse(text = paste0("dendro", bIndexString))) - + while (flag) { - + # Get attributes att <- attributes(branch) - + # Get split with the label of interest labList <- lapply(branch, function(split) attributes(split)$classLabels) wSplit <- which(unlist(lapply( @@ -209,29 +209,29 @@ plotDendro <- function(decisionTree, function(vec) { classLabel %in% vec }))) - + # Keep labels for this branch branch <- lapply(branch, function(edge) { attributes(edge)$keepLabel <- TRUE return(edge) }) - + # Make a dendrogram class again class(branch) <- "dendrogram" attributes(branch) <- att - + # Add branch to dendro eval(parse(text = paste0("dendro", bIndexString, "<- branch"))) - + # Create new bIndexString bIndexString <- paste0(bIndexString, "[[", wSplit, "]]") - + # Get branch branch <- eval(parse(text = paste0("dendro", bIndexString))) - + # Add flag flag <- attributes(branch)$members > 1 } - + return(dendro) -} +} \ No newline at end of file From 0c1325bac954f5b3fa5c351e890444131ebde1fd Mon Sep 17 00:00:00 2001 From: ericreed Date: Fri, 5 Jul 2019 12:31:10 -0400 Subject: [PATCH 029/149] Fixed leaf and rule labelling discrpency issue --- R/findMarkers.R | 318 ++++++++++++++++++++++-------------------------- 1 file changed, 148 insertions(+), 170 deletions(-) diff --git a/R/findMarkers.R b/R/findMarkers.R index 677c2fe4..06c95344 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -91,140 +91,130 @@ findMarkers <- function(features, reuseFeatures = FALSE, altSplit = TRUE, consecutiveOneoff = TRUE) { - + if (ncol(features) != length(class)) { stop("Number of columns of features must equal length of class") } - + if (any(is.na(class))) { stop("NA class values") } - + if (any(is.na(features))){ stop("NA feature values") } - + # Match the oneoffMetric argument oneoffMetric <- match.arg(oneoffMetric) - + # Transpose features features <- t(features) - + # If no detailed cell types are provided if(!hasArg(cellTypes)){ - - print('Building tree...') - - # Set class to factor - class <- as.factor(class) - - # Generate list of tree levels - tree <- .generateTreeList( - features, - class, - oneoffMetric, - threshold, - reuseFeatures, - consecutiveOneoff) - - # Add alternative node for the solely down-regulated leaf - if (altSplit) { - tree <- .addAlternativeSplit(tree, features, class) - } - - print('Computing performance metrics...') - - # Format tree output for plotting and generate summary statistics - DTsummary <- .summarizeTree(tree, features, class) - - return(DTsummary) - + + print('Building tree...') + + # Set class to factor + class <- as.factor(class) + + # Generate list of tree levels + tree <- .generateTreeList( + features, + class, + oneoffMetric, + threshold, + reuseFeatures, + consecutiveOneoff) + + # Add alternative node for the solely down-regulated leaf + if (altSplit) { + tree <- .addAlternativeSplit(tree, features, class) + } + + print('Computing performance metrics...') + + # Format tree output for plotting and generate summary statistics + DTsummary <- .summarizeTree(tree, features, class) + + return(DTsummary) } else { # If detailed cell types are provided - + # Check that cell types match class labels if(mean(unlist(cellTypes) %in% unique(class)) != 1) { stop("Provided cell types do not match class labels. - Please check the 'cellTypes' argument.") + Please check the 'cellTypes' argument.") } - + # Create vector with cell type class labels newLabels <- class for (i in names(cellTypes)) { newLabels[newLabels %in% cellTypes[[i]]] <- i } - + + # Find which cell types have more than one cluster + largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) + # Update cell subtype labels subtypeLabels <- newLabels - subtypeLabels[subtypeLabels %in% names(cellTypes)] <- paste0( - subtypeLabels[subtypeLabels %in% names(cellTypes)], + subtypeLabels[subtypeLabels %in% largeCellTypes] <- paste0( + subtypeLabels[subtypeLabels %in% largeCellTypes], "(", - class[subtypeLabels %in% names(cellTypes)], + class[subtypeLabels %in% largeCellTypes], ")" ) - + # Create tree for cell types print('Building tree for all cell types...') tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, - threshold, reuseFeatures, consecutiveOneoff) + threshold, reuseFeatures, consecutiveOneoff) tree <- list( rules = .mapClass2features(tree, features, as.factor(newLabels))$rules, dendro = .convertToDendrogram(tree, as.factor(newLabels)) ) - + # Store tree's dendrogram in a separate variable dendro <- tree$dendro - - # Find which cell types have more than one cluster - largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) - + # Create separate trees for each cell type with more than one cluster - newTrees <- lapply(unique(newLabels), function(cellType){ - - # Create tree for this cell type only - if (cellType %in% largeCellTypes) { - - # Print current status - print(paste('Building tree for cell type:', cellType)) - - # Remove used features - featUse <- colnames(features) - if (!reuseFeatures) { - featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] - } - - # Create new tree - newTree <- .generateTreeList(features[newLabels == cellType, - featUse], - as.factor(subtypeLabels[ - newLabels == cellType]), - oneoffMetric, threshold, - reuseFeatures, consecutiveOneoff) - newTree <- list( - rules = .mapClass2features(newTree, - features[newLabels == cellType,], - as.factor(subtypeLabels[ - newLabels == cellType]))$rules, - dendro = .convertToDendrogram(newTree, - as.factor(subtypeLabels[ - newLabels == cellType])) - ) - - # Adjust 'rules' table for new tree - newTree$rules <- lapply(newTree$rules, function(rules){ - rules$level <- rules$level + max(tree$rules[[cellType]]$level) - rules <- rbind(tree$rules[[cellType]], rules) - }) - - return(newTree) - } else { - # Adjust name of class if it contains only one cluster - names(tree$rules)[which(names(tree$rules) == cellType)] <<- paste0( - cellType, '(', unlist(cellTypes[cellType]),')') - return() + newTrees <- lapply(largeCellTypes, function(cellType){ + + # Print current status + print(paste('Building tree for cell type:', cellType)) + + # Remove used features + featUse <- colnames(features) + if (!reuseFeatures) { + featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] } + + # Create new tree + newTree <- .generateTreeList(features[newLabels == cellType, featUse], + as.factor(subtypeLabels[ + newLabels == cellType]), + oneoffMetric, threshold, + reuseFeatures, consecutiveOneoff) + newTree <- list( + rules = .mapClass2features(newTree, + features[newLabels == cellType,], + as.factor(subtypeLabels[ + newLabels == cellType]))$rules, + dendro = .convertToDendrogram(newTree, + as.factor(subtypeLabels[ + newLabels == cellType])) + ) + + # Adjust 'rules' table for new tree + newTree$rules <- lapply(newTree$rules, function(rules){ + rules$level <- rules$level + max(tree$rules[[cellType]]$level) + rules <- rbind(tree$rules[[cellType]], rules) + }) + + return(newTree) }) - + names(newTrees) <- largeCellTypes + # Fix max depth in original tree maxDepth <- max(unlist(lapply(newTrees, function(newTree) { lapply(newTree$rules, function(ruleDF) { @@ -232,55 +222,48 @@ findMarkers <- function(features, }) }))) addDepth <- maxDepth - attributes(dendro)$height - + dendro <- dendrapply(dendro, function(node, addDepth) { if(attributes(node)$height > 1){ - attributes(node)$height <- - attributes(node)$height + addDepth + 1 + attributes(node)$height <- attributes(node)$height + addDepth + 1 } return(node) }, addDepth) - - # Remove empty trees for single-cluster cell types - newTrees <- newTrees[lengths(newTrees) > 0] - names(newTrees) <- unique(newLabels)[ - unique(newLabels) %in% largeCellTypes] - + # Find indices of cell type nodes in tree - indices <- lapply(unique(newLabels)[ - unique(newLabels) %in% largeCellTypes], - function(cellType) { - # Initialize sub trees, indices string, and flag - dendSub <- dendro - index <- "" - flag <- TRUE - - while (flag) { - # Get the edge with the class of interest - whEdge <- which(unlist(lapply(dendSub, function(edge) - cellType %in% attributes(edge)$classLabels))) - - # Add this as a string - index <- paste0(index, "[[", whEdge, "]]") - - # Move to this branch - dendSub <- eval(parse(text = paste0("dendro", index))) - - # Is this the only class in that branch - flag <- length(attributes(dendSub)$classLabels) > 1 - } - - return(index) - }) - names(indices) <- unique(newLabels)[ - unique(newLabels) %in% largeCellTypes] - + indices <- lapply(largeCellTypes, + function(cellType) { + # Initialize sub trees, indices string, and flag + dendSub <- dendro + index <- "" + flag <- TRUE + + while (flag) { + # Get the edge with the class of interest + whEdge <- which(unlist(lapply(dendSub, function(edge) + cellType %in% attributes(edge)$classLabels))) + + # Add this as a string + index <- paste0(index, "[[", whEdge, "]]") + + # Move to this branch + dendSub <- eval(parse(text = paste0("dendro", index))) + + # Is this the only class in that branch + flag <- length(attributes(dendSub)$classLabels) > 1 + } + + return(index) + } + ) + names(indices) <- largeCellTypes + # Add each cell type tree - for (cellType in names(newTrees)) { - + for(cellType in largeCellTypes){ + # Get current tree cellTypeDendro <- newTrees[[cellType]]$dendro - + # Adjust labels, member count, and midpoint of nodes dendro <- dendrapply(dendro, function(node){ # Check if in right branch @@ -289,31 +272,28 @@ findMarkers <- function(features, attributes(node)$classLabels <- as.character(attributes(node)$classLabels) %>% .[. != cellType] %>% - c(., unique(subtypeLabels)[grep(cellType, - unique(subtypeLabels))]) - + c(., unique(subtypeLabels)[grep(cellType, unique(subtypeLabels))]) + # Assign new member count for this branch - attributes(node)$members <- - length(attributes(node)$classLabels) - + attributes(node)$members <- length(attributes(node)$classLabels) + # Assign new midpoint for this branch - attributes(node)$midpoint <- - (attributes(node)$members - 1) / 2 + attributes(node)$midpoint <- (attributes(node)$members - 1) / 2 } return(node) }) - + # Replace label at new tree's branch point branchPointAttr <- attributes(eval(parse( - text = paste0("dendro", indices[[cellType]])))) + text = paste0("dendro", indices[[cellType]])))) branchPointLabel <- branchPointAttr$label branchPointStatUsed <- branchPointAttr$statUsed - - if(!is.null(branchPointLabel)) { - attributes(cellTypeDendro)$label <- branchPointLabel - attributes(cellTypeDendro)$statUsed <- branchPointStatUsed - } - + + if(!is.null(branchPointLabel)) { + attributes(cellTypeDendro)$label <- branchPointLabel + attributes(cellTypeDendro)$statUsed <- branchPointStatUsed + } + # Fix height indLoc <- gregexpr("\\[\\[", indices[[cellType]])[[1]] indLoc <- indLoc[length(indLoc)] @@ -323,39 +303,37 @@ findMarkers <- function(features, parentHeight <- attributes(eval(parse( text = paste0("dendro", parentIndexString))))$height cellTypeHeight <- attributes(cellTypeDendro)$height - cellTypeDendro <- dendrapply(cellTypeDendro, - function(node, - parentHeight, - cellTypeHeight) { - if(attributes(node)$height > 1){ - attributes(node)$height <- parentHeight - 1 - - (cellTypeHeight - attributes(node)$height) - } - return(node) - }, parentHeight, cellTypeHeight) - + cellTypeDendro <- dendrapply(cellTypeDendro, + function(node, + parentHeight, + cellTypeHeight) { + if(attributes(node)$height > 1){ + attributes(node)$height <- parentHeight - 1 - + (cellTypeHeight - attributes(node)$height) + } + return(node) + }, parentHeight, cellTypeHeight) + # Add new tree to original tree eval(parse(text = paste0( - "dendro", indices[[cellType]], " <- cellTypeDendro"))) - + "dendro", indices[[cellType]], " <- cellTypeDendro"))) + # Append new tree's 'rules' tables to original tree tree$rules <- append(tree$rules, newTrees[[cellType]]$rules) - + # Remove old tree's rules tree$rules <- tree$rules[-which(names(tree$rules) == cellType)] } - + # Set final tree dendro tree$dendro <- dendro - - # Get performance metrics - print('Computing performance metrics...') - perfList <- .getPerformance(tree$rules, - features, - as.factor(subtypeLabels)) + + # Get performance statistics + print('Computing performance statistics...') + perfList <- .getPerformance(tree$rules, features, as.factor(subtypeLabels)) tree$prediction <- perfList$prediction tree$performance <- perfList$performance - + return(tree) - } + } } From de7e28df9ccec445f29ee1572c64e036991da37d Mon Sep 17 00:00:00 2001 From: ericreed Date: Fri, 5 Jul 2019 12:37:18 -0400 Subject: [PATCH 030/149] Fixed style issues --- R/findMarkers.R | 199 ++++++++++++++++++++++++++---------------------- 1 file changed, 107 insertions(+), 92 deletions(-) diff --git a/R/findMarkers.R b/R/findMarkers.R index 06c95344..4c271e79 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -91,33 +91,33 @@ findMarkers <- function(features, reuseFeatures = FALSE, altSplit = TRUE, consecutiveOneoff = TRUE) { - + if (ncol(features) != length(class)) { stop("Number of columns of features must equal length of class") } - + if (any(is.na(class))) { stop("NA class values") } - + if (any(is.na(features))){ stop("NA feature values") } - + # Match the oneoffMetric argument oneoffMetric <- match.arg(oneoffMetric) - + # Transpose features features <- t(features) - + # If no detailed cell types are provided if(!hasArg(cellTypes)){ - + print('Building tree...') - + # Set class to factor class <- as.factor(class) - + # Generate list of tree levels tree <- .generateTreeList( features, @@ -126,36 +126,36 @@ findMarkers <- function(features, threshold, reuseFeatures, consecutiveOneoff) - + # Add alternative node for the solely down-regulated leaf if (altSplit) { tree <- .addAlternativeSplit(tree, features, class) } - + print('Computing performance metrics...') - + # Format tree output for plotting and generate summary statistics DTsummary <- .summarizeTree(tree, features, class) - + return(DTsummary) } else { # If detailed cell types are provided - + # Check that cell types match class labels if(mean(unlist(cellTypes) %in% unique(class)) != 1) { stop("Provided cell types do not match class labels. Please check the 'cellTypes' argument.") } - + # Create vector with cell type class labels newLabels <- class for (i in names(cellTypes)) { newLabels[newLabels %in% cellTypes[[i]]] <- i } - + # Find which cell types have more than one cluster largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) - + # Update cell subtype labels subtypeLabels <- newLabels subtypeLabels[subtypeLabels %in% largeCellTypes] <- paste0( @@ -164,57 +164,59 @@ findMarkers <- function(features, class[subtypeLabels %in% largeCellTypes], ")" ) - + # Create tree for cell types print('Building tree for all cell types...') tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, threshold, reuseFeatures, consecutiveOneoff) tree <- list( - rules = .mapClass2features(tree, features, as.factor(newLabels))$rules, + rules = .mapClass2features(tree, features, + as.factor(newLabels))$rules, dendro = .convertToDendrogram(tree, as.factor(newLabels)) ) - + # Store tree's dendrogram in a separate variable dendro <- tree$dendro - + # Create separate trees for each cell type with more than one cluster newTrees <- lapply(largeCellTypes, function(cellType){ - + # Print current status print(paste('Building tree for cell type:', cellType)) - + # Remove used features featUse <- colnames(features) if (!reuseFeatures) { featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] } - + # Create new tree - newTree <- .generateTreeList(features[newLabels == cellType, featUse], - as.factor(subtypeLabels[ - newLabels == cellType]), - oneoffMetric, threshold, - reuseFeatures, consecutiveOneoff) + newTree <- .generateTreeList(features[newLabels == cellType, + featUse], + as.factor(subtypeLabels[ + newLabels == cellType]), + oneoffMetric, threshold, + reuseFeatures, consecutiveOneoff) newTree <- list( rules = .mapClass2features(newTree, - features[newLabels == cellType,], - as.factor(subtypeLabels[ - newLabels == cellType]))$rules, + features[newLabels == cellType,], + as.factor(subtypeLabels[ + newLabels == cellType]))$rules, dendro = .convertToDendrogram(newTree, - as.factor(subtypeLabels[ - newLabels == cellType])) + as.factor(subtypeLabels[ + newLabels == cellType])) ) - + # Adjust 'rules' table for new tree newTree$rules <- lapply(newTree$rules, function(rules){ rules$level <- rules$level + max(tree$rules[[cellType]]$level) rules <- rbind(tree$rules[[cellType]], rules) }) - + return(newTree) }) names(newTrees) <- largeCellTypes - + # Fix max depth in original tree maxDepth <- max(unlist(lapply(newTrees, function(newTree) { lapply(newTree$rules, function(ruleDF) { @@ -222,48 +224,53 @@ findMarkers <- function(features, }) }))) addDepth <- maxDepth - attributes(dendro)$height - + dendro <- dendrapply(dendro, function(node, addDepth) { if(attributes(node)$height > 1){ - attributes(node)$height <- attributes(node)$height + addDepth + 1 + attributes(node)$height <- attributes(node)$height + + addDepth + 1 } return(node) }, addDepth) - + # Find indices of cell type nodes in tree indices <- lapply(largeCellTypes, - function(cellType) { - # Initialize sub trees, indices string, and flag - dendSub <- dendro - index <- "" - flag <- TRUE - - while (flag) { - # Get the edge with the class of interest - whEdge <- which(unlist(lapply(dendSub, function(edge) - cellType %in% attributes(edge)$classLabels))) - - # Add this as a string - index <- paste0(index, "[[", whEdge, "]]") - - # Move to this branch - dendSub <- eval(parse(text = paste0("dendro", index))) - - # Is this the only class in that branch - flag <- length(attributes(dendSub)$classLabels) > 1 - } - - return(index) - } + function(cellType) { + # Initialize sub trees, indices string, and flag + dendSub <- dendro + index <- "" + flag <- TRUE + + while (flag) { + # Get the edge with the class of interest + whEdge <- which(unlist(lapply(dendSub, + function(edge) + cellType %in% + attributes(edge)$classLabels))) + + # Add this as a string + index <- paste0(index, "[[", whEdge, "]]") + + # Move to this branch + dendSub <- eval(parse(text = + paste0("dendro", index))) + + # Is this the only class in that branch + flag <- length( + attributes(dendSub)$classLabels) > 1 + } + + return(index) + } ) names(indices) <- largeCellTypes - + # Add each cell type tree for(cellType in largeCellTypes){ - + # Get current tree cellTypeDendro <- newTrees[[cellType]]$dendro - + # Adjust labels, member count, and midpoint of nodes dendro <- dendrapply(dendro, function(node){ # Check if in right branch @@ -272,28 +279,31 @@ findMarkers <- function(features, attributes(node)$classLabels <- as.character(attributes(node)$classLabels) %>% .[. != cellType] %>% - c(., unique(subtypeLabels)[grep(cellType, unique(subtypeLabels))]) - + c(., unique(subtypeLabels)[grep(cellType, + unique(subtypeLabels))]) + # Assign new member count for this branch - attributes(node)$members <- length(attributes(node)$classLabels) - + attributes(node)$members <- + length(attributes(node)$classLabels) + # Assign new midpoint for this branch - attributes(node)$midpoint <- (attributes(node)$members - 1) / 2 + attributes(node)$midpoint <- + (attributes(node)$members - 1) / 2 } return(node) }) - + # Replace label at new tree's branch point branchPointAttr <- attributes(eval(parse( text = paste0("dendro", indices[[cellType]])))) branchPointLabel <- branchPointAttr$label branchPointStatUsed <- branchPointAttr$statUsed - + if(!is.null(branchPointLabel)) { attributes(cellTypeDendro)$label <- branchPointLabel attributes(cellTypeDendro)$statUsed <- branchPointStatUsed } - + # Fix height indLoc <- gregexpr("\\[\\[", indices[[cellType]])[[1]] indLoc <- indLoc[length(indLoc)] @@ -303,37 +313,42 @@ findMarkers <- function(features, parentHeight <- attributes(eval(parse( text = paste0("dendro", parentIndexString))))$height cellTypeHeight <- attributes(cellTypeDendro)$height - cellTypeDendro <- dendrapply(cellTypeDendro, - function(node, - parentHeight, - cellTypeHeight) { - if(attributes(node)$height > 1){ - attributes(node)$height <- parentHeight - 1 - - (cellTypeHeight - attributes(node)$height) - } - return(node) - }, parentHeight, cellTypeHeight) - + cellTypeDendro <- dendrapply(cellTypeDendro, + function(node, + parentHeight, + cellTypeHeight) { + if(attributes(node)$height > 1){ + attributes(node)$height <- + parentHeight - 1 - + (cellTypeHeight - + attributes( + node)$height) + } + return(node) + }, parentHeight, cellTypeHeight) + # Add new tree to original tree eval(parse(text = paste0( "dendro", indices[[cellType]], " <- cellTypeDendro"))) - + # Append new tree's 'rules' tables to original tree tree$rules <- append(tree$rules, newTrees[[cellType]]$rules) - + # Remove old tree's rules tree$rules <- tree$rules[-which(names(tree$rules) == cellType)] } - + # Set final tree dendro tree$dendro <- dendro - + # Get performance statistics print('Computing performance statistics...') - perfList <- .getPerformance(tree$rules, features, as.factor(subtypeLabels)) + perfList <- .getPerformance(tree$rules, + features, + as.factor(subtypeLabels)) tree$prediction <- perfList$prediction tree$performance <- perfList$performance - + return(tree) - } + } } From 4654a94193cc86584c4ad53bfe22220961743581 Mon Sep 17 00:00:00 2001 From: zhewa Date: Tue, 9 Jul 2019 21:29:19 -0400 Subject: [PATCH 031/149] add magrittr import --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9cc6b688..84314712 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,7 +42,8 @@ Imports: withr, dendextend, ggdendro, - pROC + pROC, + magrittr Suggests: testthat, knitr, From bf8b8863e87bee194fe18983cfa4fee4f61a4ed8 Mon Sep 17 00:00:00 2001 From: zhewa Date: Tue, 9 Jul 2019 21:32:46 -0400 Subject: [PATCH 032/149] add documentation --- man/findMarkers.Rd | 68 ++++++++++++++++++++++++++++++---------------- man/plotDendro.Rd | 6 +++- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/man/findMarkers.Rd b/man/findMarkers.Rd index a01e3d76..d778cb3f 100644 --- a/man/findMarkers.Rd +++ b/man/findMarkers.Rd @@ -13,34 +13,56 @@ findMarkers(features, class, cellTypes, oneoffMetric = c("modified F1", \item{class}{A vector of K label assignemnts.} -\item{cellTypes}{List where each element is a cell type and all the clusters within that cell type (i.e. subtypes).} +\item{oneoffMetric}{A character string. What one-off metric to run, either +`modified F1` or `pairwise AUC`. +@param cellTypes List where each element is a cell type and all the clusters + within that cell type (i.e. subtypes).} -\item{oneoffMetric}{A character string. What one-off metric to run, either `modified F1` or `pairwise AUC`.} +\item{threshold}{A numeric value. The threshold for the oneoff metric to use +between 0 and 1, 0.95 by default. Smaller values will result is more one-off +splits.} -\item{threshold}{A numeric value. The threshold for the oneoff metric to use between 0 and 1, 0.95 by default. Smaller values will result is more one-off splits.} +\item{reuseFeatures}{Logical. Whether or not a feature can be used more than +once on the same cluster. Default is TRUE.} -\item{reuseFeatures}{Logical. Whether or not a feature can be used more than once on the same cluster. Default is TRUE.} +\item{altSplit}{Logical. Whether or not to force a marker for clusters that +are solely defined by the absence of markers. Defulault is TRUE} -\item{altSplit}{Logical. Whether or not to force a marker for clusters that are solely defined by the absence of markers. Defsult is TRUE} - -\item{consecutiveOneoff}{Logical. Whether or not to allow one-off splits at consecutive brances. Default is TRUE} +\item{consecutiveOneoff}{Logical. Whether or not to allow one-off splits at +consecutive brances. Default it TRUE} } \value{ A named list with five elements. \itemize{ \item rules - A named list with one `data.frame` for every label. Each -`data.frame` has five columns and gives the set of rules for disinguishing each label. +`data.frame` has five columns and gives the set of rules for disinguishing + each label. \itemize{ \item feature - Feature identifier. - \item direction - Relationship to feature value, -1 if less than, 1 if greater than. + \item direction - Relationship to feature value, -1 if less than, 1 if + greater than. \item value - The feature value which defines the decision boundary - \item stat - The performance value returned by the splitting metric for this split. - \item statUsed - Which performance metric was used. "IG" if information gain and "OO" if one-off. - \item level - The level of the tree at which is rule was defined. 1 is the level of the first split of the tree. + \item stat - The performance value returned by the splitting metric for + this split. + \item statUsed - Which performance metric was used. "IG" if information + gain and "OO" if one-off. + \item level - The level of the tree at which is rule was defined. 1 is the + level of the first split of the tree. } \item dendro - A dendrogram object of the decision tree output - \item prediction - A character vector of label of predictions of the training data using the final model. "MISSING" if label prediction was ambiguous. - \item performance - A named list denoting the training performance of the model. + \item summaryMatrix - A K(labels) by L(features) matrix representation of + the decision rules. Columns denote features and rows denote labels. Non-0 + values denote instances where a feature was used on a given label. Positive + and negative values denote whether the values of the label for that feature + were greater than or less than the decision threshold, respectively. The + magnitude of Non-0 values denote the level at which the feature was used, + where the first split has a magnitude of 1. Note, if reuse_features = TRUE, + only the final usage of a feature for a given label is shown. + \item prediction - A character vector of label of predictions of the + training data using the final model. "MISSING" if label prediction was + ambiguous. + \item performance - A named list denoting the training performance of the + model. \itemize{ \item accuracy - (number correct/number of samples) for the whole set of samples. @@ -63,21 +85,21 @@ Uses decision tree procudure to generate a set of rules for each \examples{ library(M3DExampleData) counts <- M3DExampleData::Mmus_example_list$data -# Subset 500 genes for fast clustering -counts <- as.matrix(counts[1501:2000, ]) -# Cluster genes ans samples each into 10 modules +# subset 100 genes for fast clustering +counts <- as.matrix(counts[1500:2000, ]) +# cluster genes into 10 modules for quick demo cm <- celda_CG(counts = counts, L = 10, K = 5, verbose = FALSE) # Get features matrix and cluster assignments factorized <- factorizeMatrix(counts, cm) features <- factorized$proportions$cell class <- clusters(cm)$z # Generate Decision Tree -decTree <- findMarkers(features, - class, - oneoffMetric = "modified F1", - threshold = 1, - consecutiveOneoff = FALSE) +DecTree <- buildTreeHybrid(features, + class, + oneoffMetric = "modified F1", + threshold = 1, + consecutiveOneoff = FALSE) # Plot dendrogram -plotDendro(decTree) +plotDendro(DecTree) } diff --git a/man/plotDendro.Rd b/man/plotDendro.Rd index 157d38e6..ced5400c 100644 --- a/man/plotDendro.Rd +++ b/man/plotDendro.Rd @@ -5,7 +5,8 @@ \title{Plots dendrogram of `findMarkers` output} \usage{ plotDendro(decisionTree, classLabel = NULL, addSensPrec = FALSE, - leafSize = 24, boxSize = 7, boxColor = "black") + maxFeaturePrint = 4, leafSize = 24, boxSize = 7, + boxColor = "black") } \arguments{ \item{decisionTree}{List object. The output of `celda::findMarkers`.} @@ -16,6 +17,9 @@ the path and rules. If NULL (default), the rules for every cluster is shown.} \item{addSensPrec}{Logical. Print training sensitivities and precisions for each cluster below leaf label? Default is FALSE.} +\item{maxFeaturePrint}{A numeric value. Maximum number of feature IDs to print +at a given node. Default is 4.} + \item{leafSize}{A numeric value. Size of text below each leaf. Default is 24.} \item{boxSize}{A numeric value. Size of rule labels. Default is 7.} From f65a8989c6a2ab89f0e21ba43e270568b9410926 Mon Sep 17 00:00:00 2001 From: zhewa <314100793@qq.com> Date: Wed, 10 Jul 2019 10:23:43 -0400 Subject: [PATCH 033/149] fix example --- R/findMarkers.R | 6 +++--- man/findMarkers.Rd | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/R/findMarkers.R b/R/findMarkers.R index 4c271e79..315a5f73 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -6,10 +6,10 @@ #' rules for identifying sets of similar clusters. #' @param features A L(features) by N(samples) numeric matrix. #' @param class A vector of K label assignemnts. +#' @param cellTypes List where each element is a cell type and all the clusters +#' within that cell type (i.e. subtypes). #' @param oneoffMetric A character string. What one-off metric to run, either #' `modified F1` or `pairwise AUC`. -#' @param cellTypes List where each element is a cell type and all the clusters -#' within that cell type (i.e. subtypes). #' @param threshold A numeric value. The threshold for the oneoff metric to use #' between 0 and 1, 0.95 by default. Smaller values will result is more one-off #' splits. @@ -73,7 +73,7 @@ #' features <- factorized$proportions$cell #' class <- clusters(cm)$z #' # Generate Decision Tree -#' DecTree <- buildTreeHybrid(features, +#' DecTree <- findMarkers(features, #' class, #' oneoffMetric = "modified F1", #' threshold = 1, diff --git a/man/findMarkers.Rd b/man/findMarkers.Rd index d778cb3f..113c5ac4 100644 --- a/man/findMarkers.Rd +++ b/man/findMarkers.Rd @@ -13,10 +13,11 @@ findMarkers(features, class, cellTypes, oneoffMetric = c("modified F1", \item{class}{A vector of K label assignemnts.} +\item{cellTypes}{List where each element is a cell type and all the clusters +within that cell type (i.e. subtypes).} + \item{oneoffMetric}{A character string. What one-off metric to run, either -`modified F1` or `pairwise AUC`. -@param cellTypes List where each element is a cell type and all the clusters - within that cell type (i.e. subtypes).} +`modified F1` or `pairwise AUC`.} \item{threshold}{A numeric value. The threshold for the oneoff metric to use between 0 and 1, 0.95 by default. Smaller values will result is more one-off @@ -94,7 +95,7 @@ factorized <- factorizeMatrix(counts, cm) features <- factorized$proportions$cell class <- clusters(cm)$z # Generate Decision Tree -DecTree <- buildTreeHybrid(features, +DecTree <- findMarkers(features, class, oneoffMetric = "modified F1", threshold = 1, From a5cf80493ca1161d34a55f02e440c61bbdb76a7e Mon Sep 17 00:00:00 2001 From: zhewa <314100793@qq.com> Date: Tue, 16 Jul 2019 11:37:45 -0400 Subject: [PATCH 034/149] update findMarkers --- NAMESPACE | 1 + R/findMarkers.R | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index fb123e06..fdf6b708 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -127,6 +127,7 @@ importFrom(gtable,gtable_height) importFrom(gtable,gtable_width) importFrom(matrixStats,logSumExp) importFrom(methods,.hasSlot) +importFrom(methods,hasArg) importFrom(methods,is) importFrom(methods,new) importFrom(pROC,auc) diff --git a/R/findMarkers.R b/R/findMarkers.R index 315a5f73..f2d8cd73 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -82,6 +82,7 @@ #' # Plot dendrogram #' plotDendro(DecTree) #' @import magrittr +#' @importFrom methods hasArg #' @export findMarkers <- function(features, class, @@ -111,7 +112,7 @@ findMarkers <- function(features, features <- t(features) # If no detailed cell types are provided - if(!hasArg(cellTypes)){ + if(!methods::hasArg(cellTypes)){ print('Building tree...') From f116e7c666153cb36777382af15490702b65f4ed Mon Sep 17 00:00:00 2001 From: zhewa <314100793@qq.com> Date: Tue, 16 Jul 2019 12:38:38 -0400 Subject: [PATCH 035/149] fix lints --- R/findMarkers.R | 131 ++++++++++++++++++-------------------- R/plotDendro.R | 156 +++++++++++++++++++++++----------------------- man/plotDendro.Rd | 4 +- 3 files changed, 141 insertions(+), 150 deletions(-) diff --git a/R/findMarkers.R b/R/findMarkers.R index f2d8cd73..b4405988 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -112,9 +112,9 @@ findMarkers <- function(features, features <- t(features) # If no detailed cell types are provided - if(!methods::hasArg(cellTypes)){ + if (!methods::hasArg(cellTypes)) { - print('Building tree...') + message("Building tree...") # Set class to factor class <- as.factor(class) @@ -133,7 +133,7 @@ findMarkers <- function(features, tree <- .addAlternativeSplit(tree, features, class) } - print('Computing performance metrics...') + message("Computing performance metrics...") # Format tree output for plotting and generate summary statistics DTsummary <- .summarizeTree(tree, features, class) @@ -143,9 +143,9 @@ findMarkers <- function(features, # If detailed cell types are provided # Check that cell types match class labels - if(mean(unlist(cellTypes) %in% unique(class)) != 1) { - stop("Provided cell types do not match class labels. - Please check the 'cellTypes' argument.") + if (mean(unlist(cellTypes) %in% unique(class)) != 1) { + stop("Provided cell types do not match class labels. ", + "Please check the 'cellTypes' argument.") } # Create vector with cell type class labels @@ -167,14 +167,12 @@ findMarkers <- function(features, ) # Create tree for cell types - print('Building tree for all cell types...') + message("Building tree for all cell types...") tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, - threshold, reuseFeatures, consecutiveOneoff) - tree <- list( - rules = .mapClass2features(tree, features, + threshold, reuseFeatures, consecutiveOneoff) + tree <- list(rules = .mapClass2features(tree, features, as.factor(newLabels))$rules, - dendro = .convertToDendrogram(tree, as.factor(newLabels)) - ) + dendro = .convertToDendrogram(tree, as.factor(newLabels))) # Store tree's dendrogram in a separate variable dendro <- tree$dendro @@ -183,7 +181,7 @@ findMarkers <- function(features, newTrees <- lapply(largeCellTypes, function(cellType){ # Print current status - print(paste('Building tree for cell type:', cellType)) + message("Building tree for cell type ", cellType) # Remove used features featUse <- colnames(features) @@ -194,19 +192,16 @@ findMarkers <- function(features, # Create new tree newTree <- .generateTreeList(features[newLabels == cellType, featUse], - as.factor(subtypeLabels[ - newLabels == cellType]), - oneoffMetric, threshold, - reuseFeatures, consecutiveOneoff) - newTree <- list( - rules = .mapClass2features(newTree, - features[newLabels == cellType,], - as.factor(subtypeLabels[ - newLabels == cellType]))$rules, + as.factor(subtypeLabels[newLabels == cellType]), + oneoffMetric, threshold, + reuseFeatures, consecutiveOneoff) + newTree <- list(rules = .mapClass2features(newTree, + features[newLabels == cellType, ], + as.factor(subtypeLabels[ + newLabels == cellType]))$rules, dendro = .convertToDendrogram(newTree, - as.factor(subtypeLabels[ - newLabels == cellType])) - ) + as.factor(subtypeLabels[ + newLabels == cellType]))) # Adjust 'rules' table for new tree newTree$rules <- lapply(newTree$rules, function(rules){ @@ -227,7 +222,7 @@ findMarkers <- function(features, addDepth <- maxDepth - attributes(dendro)$height dendro <- dendrapply(dendro, function(node, addDepth) { - if(attributes(node)$height > 1){ + if (attributes(node)$height > 1) { attributes(node)$height <- attributes(node)$height + addDepth + 1 } @@ -236,38 +231,38 @@ findMarkers <- function(features, # Find indices of cell type nodes in tree indices <- lapply(largeCellTypes, - function(cellType) { - # Initialize sub trees, indices string, and flag - dendSub <- dendro - index <- "" - flag <- TRUE - - while (flag) { - # Get the edge with the class of interest - whEdge <- which(unlist(lapply(dendSub, - function(edge) - cellType %in% - attributes(edge)$classLabels))) - - # Add this as a string - index <- paste0(index, "[[", whEdge, "]]") - - # Move to this branch - dendSub <- eval(parse(text = - paste0("dendro", index))) - - # Is this the only class in that branch - flag <- length( - attributes(dendSub)$classLabels) > 1 - } - - return(index) - } + function(cellType) { + # Initialize sub trees, indices string, and flag + dendSub <- dendro + index <- "" + flag <- TRUE + + while (flag) { + # Get the edge with the class of interest + whEdge <- which(unlist(lapply(dendSub, + function(edge) + cellType %in% + attributes(edge)$classLabels))) + + # Add this as a string + index <- paste0(index, "[[", whEdge, "]]") + + # Move to this branch + dendSub <- eval(parse(text = + paste0("dendro", index))) + + # Is this the only class in that branch + flag <- length( + attributes(dendSub)$classLabels) > 1 + } + + return(index) + } ) names(indices) <- largeCellTypes # Add each cell type tree - for(cellType in largeCellTypes){ + for (cellType in largeCellTypes) { # Get current tree cellTypeDendro <- newTrees[[cellType]]$dendro @@ -275,7 +270,7 @@ findMarkers <- function(features, # Adjust labels, member count, and midpoint of nodes dendro <- dendrapply(dendro, function(node){ # Check if in right branch - if(cellType %in% as.character(attributes(node)$classLabels)){ + if (cellType %in% as.character(attributes(node)$classLabels)) { # Replace cell type label with subtype labels attributes(node)$classLabels <- as.character(attributes(node)$classLabels) %>% @@ -300,7 +295,7 @@ findMarkers <- function(features, branchPointLabel <- branchPointAttr$label branchPointStatUsed <- branchPointAttr$statUsed - if(!is.null(branchPointLabel)) { + if (!is.null(branchPointLabel)) { attributes(cellTypeDendro)$label <- branchPointLabel attributes(cellTypeDendro)$statUsed <- branchPointStatUsed } @@ -310,23 +305,19 @@ findMarkers <- function(features, indLoc <- indLoc[length(indLoc)] parentIndexString <- substr(indices[[cellType]], 0, - indLoc-1) + indLoc - 1) parentHeight <- attributes(eval(parse( text = paste0("dendro", parentIndexString))))$height cellTypeHeight <- attributes(cellTypeDendro)$height cellTypeDendro <- dendrapply(cellTypeDendro, - function(node, - parentHeight, - cellTypeHeight) { - if(attributes(node)$height > 1){ - attributes(node)$height <- - parentHeight - 1 - - (cellTypeHeight - - attributes( - node)$height) - } - return(node) - }, parentHeight, cellTypeHeight) + function(node, parentHeight, cellTypeHeight) { + if (attributes(node)$height > 1){ + attributes(node)$height <- + parentHeight - 1 - (cellTypeHeight - attributes( + node)$height) + } + return(node) + }, parentHeight, cellTypeHeight) # Add new tree to original tree eval(parse(text = paste0( @@ -343,7 +334,7 @@ findMarkers <- function(features, tree$dendro <- dendro # Get performance statistics - print('Computing performance statistics...') + message("Computing performance statistics...") perfList <- .getPerformance(tree$rules, features, as.factor(subtypeLabels)) diff --git a/R/plotDendro.R b/R/plotDendro.R index 28e1e580..683574cf 100644 --- a/R/plotDendro.R +++ b/R/plotDendro.R @@ -6,8 +6,8 @@ #' the path and rules. If NULL (default), the rules for every cluster is shown. #' @param addSensPrec Logical. Print training sensitivities and precisions #' for each cluster below leaf label? Default is FALSE. -#' @param maxFeaturePrint A numeric value. Maximum number of feature IDs to print -#' at a given node. Default is 4. +#' @param maxFeaturePrint A numeric value. Maximum number of feature IDs to +#' print at a given node. Default is 4. #' @param leafSize A numeric value. Size of text below each leaf. Default is 24. #' @param boxSize A numeric value. Size of rule labels. Default is 7. #' @param boxColor A character value. Color of rule labels. Default is `black`. @@ -37,19 +37,19 @@ #' @importFrom dendextend get_nodes_xy get_nodes_attr get_leaves_attr #' @export plotDendro <- function(decisionTree, - classLabel = NULL, - addSensPrec = FALSE, - maxFeaturePrint = 4, - leafSize = 24, - boxSize = 7, - boxColor = "black") { - + classLabel = NULL, + addSensPrec = FALSE, + maxFeaturePrint = 4, + leafSize = 24, + boxSize = 7, + boxColor = "black") { + # Get necessary elements dendro <- decisionTree$dendro - + # Get performance information (training or CV based) performance <- decisionTree$performance - + # Create vector of per class performance perfVec <- paste(performance$sizes, format(round(performance$sensitivity, 2), nsmall = 2), @@ -57,14 +57,14 @@ plotDendro <- function(decisionTree, sep = "\n" ) names(perfVec) <- names(performance$sensitivity) - + # Get dendrogram segments dendSegs <- ggdendro::dendro_data(dendro, type = "rectangle")$segments - + # Get necessary coordinates to add labels to # These will have y > 1 dendSegs <- unique(dendSegs[dendSegs$y > 1, c("x", "y", "yend", "xend")]) - + # Labeled splits will be vertical (x != xend) or # Length 0 (x == xend & y == yend) dendSegsAlt <- dendSegs[ @@ -72,26 +72,26 @@ plotDendro <- function(decisionTree, (dendSegs$x == dendSegs$xend & dendSegs$y == dendSegs$yend), c("x", "xend", "y")] colnames(dendSegsAlt)[1] <- "xalt" - + # Label names will be at nodes, these will # Occur at the end of segments segs <- as.data.frame(dendextend::get_nodes_xy(dendro)) colnames(segs) <- c("xend", "yend") - + # Add labels to nodes segs$label <- gsub(";", "\n", dendextend::get_nodes_attr(dendro, "label")) segs$label <- sapply(segs$label, function(lab, maxFeaturePrint) { loc <- gregexpr("\n", lab)[[1]][maxFeaturePrint] - if(!is.na(loc)) { - lab <- substr(lab, 1, loc-1) + if (!is.na(loc)) { + lab <- substr(lab, 1, loc - 1) } return(lab) }, maxFeaturePrint) - + # Subset for max - + segs$statUsed <- dendextend::get_nodes_attr(dendro, "statUsed") - + # If highlighting a class label, remove non-class specific rules if (!is.null(classLabel)) { if (!classLabel %in% names(decisionTree$rules)) { @@ -102,40 +102,40 @@ plotDendro <- function(decisionTree, keepLabel[is.na(keepLabel)] <- FALSE segs$label[!keepLabel] <- NA } - + # Remove non-labelled nodes & # leaf nodes (yend == 0) segs <- segs[!is.na(segs$label) & segs$yend != 0, ] - + # Merge to full set of coordinates dendSegsLabelled <- merge(dendSegs, segs) - + # Remove duplicated labels dendSegsLabelled <- dendSegsLabelled[order(dendSegsLabelled$y, - decreasing = T), ] + decreasing = T), ] dendSegsLabelled <- dendSegsLabelled[ !duplicated(dendSegsLabelled[, - c("xend", "x", "yend", "label", "statUsed")]), ] - + c("xend", "x", "yend", "label", "statUsed")]), ] + # Merge with alternative x-coordinates for alternative split dendSegsLabelled <- merge(dendSegsLabelled, dendSegsAlt) - + # Order by height and coordinates dendSegsLabelled <- dendSegsLabelled[order(dendSegsLabelled$x), ] - + # Find information gain splits igSplits <- dendSegsLabelled$statUsed == "IG" & !duplicated(dendSegsLabelled[, c("xalt", "y")]) - + # Set xend for IG splits dendSegsLabelled$xend[igSplits] <- dendSegsLabelled$xalt[igSplits] - + # Set y for non-IG splits dendSegsLabelled$y[!igSplits] <- dendSegsLabelled$y[!igSplits] - 0.2 - + # Get index of leaf labels leafLabels <- dendextend::get_leaves_attr(dendro, "label") - + # Add sensitivity and precision measurements if (addSensPrec) { leafLabels <- paste(leafLabels, perfVec[leafLabels], sep = "\n") @@ -147,61 +147,61 @@ plotDendro <- function(decisionTree, leafHJust <- 1 leafVJust <- 0.5 } - + # Create plot of dendrogram suppressMessages(dendroP <- ggdendro::ggdendrogram(dendro) + - ggplot2::geom_label( - data = dendSegsLabelled, - ggplot2::aes(x = xend, y = y, label = label), - size = boxSize, - label.size = 1, - fontface = "bold", - vjust = 1, - nudge_y = 0.1, - color = boxColor) + - ggplot2::theme_bw() + - ggplot2::scale_x_reverse(breaks = seq(length(leafLabels)), - label = leafLabels) + - ggplot2::scale_y_continuous(expand = c(0, 0)) + - ggplot2::theme( - panel.grid.major.y = ggplot2::element_blank(), - legend.position = "none", - panel.grid.minor.y = ggplot2::element_blank(), - panel.grid.minor.x = ggplot2::element_blank(), - panel.grid.major.x = ggplot2::element_blank(), - panel.border = ggplot2::element_blank(), - axis.title = ggplot2::element_blank(), - axis.ticks = ggplot2::element_blank(), - axis.text.x = ggplot2::element_text( - hjust = leafHJust, - angle = leafAngle, - size = leafSize, - family = "mono", - vjust = leafVJust), - axis.text.y = ggplot2::element_blank() - )) - + ggplot2::geom_label( + data = dendSegsLabelled, + ggplot2::aes(x = xend, y = y, label = label), + size = boxSize, + label.size = 1, + fontface = "bold", + vjust = 1, + nudge_y = 0.1, + color = boxColor) + + ggplot2::theme_bw() + + ggplot2::scale_x_reverse(breaks = seq(length(leafLabels)), + label = leafLabels) + + ggplot2::scale_y_continuous(expand = c(0, 0)) + + ggplot2::theme( + panel.grid.major.y = ggplot2::element_blank(), + legend.position = "none", + panel.grid.minor.y = ggplot2::element_blank(), + panel.grid.minor.x = ggplot2::element_blank(), + panel.grid.major.x = ggplot2::element_blank(), + panel.border = ggplot2::element_blank(), + axis.title = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + axis.text.x = ggplot2::element_text( + hjust = leafHJust, + angle = leafAngle, + size = leafSize, + family = "mono", + vjust = leafVJust), + axis.text.y = ggplot2::element_blank() + )) + # Increase line width slightly for aesthetic purposes dendroP$layers[[2]]$aes_params$size <- 1.3 - + return(dendroP) } # Function to reformat the dendrogram to draw path to a specific class .highlightClassLabel <- function(dendro, classLabel) { - + # Reorder dendrogram flag <- TRUE bIndexString <- "" - + # Get branch branch <- eval(parse(text = paste0("dendro", bIndexString))) - + while (flag) { - + # Get attributes att <- attributes(branch) - + # Get split with the label of interest labList <- lapply(branch, function(split) attributes(split)$classLabels) wSplit <- which(unlist(lapply( @@ -209,29 +209,29 @@ plotDendro <- function(decisionTree, function(vec) { classLabel %in% vec }))) - + # Keep labels for this branch branch <- lapply(branch, function(edge) { attributes(edge)$keepLabel <- TRUE return(edge) }) - + # Make a dendrogram class again class(branch) <- "dendrogram" attributes(branch) <- att - + # Add branch to dendro eval(parse(text = paste0("dendro", bIndexString, "<- branch"))) - + # Create new bIndexString bIndexString <- paste0(bIndexString, "[[", wSplit, "]]") - + # Get branch branch <- eval(parse(text = paste0("dendro", bIndexString))) - + # Add flag flag <- attributes(branch)$members > 1 } - + return(dendro) } \ No newline at end of file diff --git a/man/plotDendro.Rd b/man/plotDendro.Rd index ced5400c..efa2e14b 100644 --- a/man/plotDendro.Rd +++ b/man/plotDendro.Rd @@ -17,8 +17,8 @@ the path and rules. If NULL (default), the rules for every cluster is shown.} \item{addSensPrec}{Logical. Print training sensitivities and precisions for each cluster below leaf label? Default is FALSE.} -\item{maxFeaturePrint}{A numeric value. Maximum number of feature IDs to print -at a given node. Default is 4.} +\item{maxFeaturePrint}{A numeric value. Maximum number of feature IDs to +print at a given node. Default is 4.} \item{leafSize}{A numeric value. Size of text below each leaf. Default is 24.} From 6a3d54b3fa0976f9e4ffa1c419aedc53ba3237fc Mon Sep 17 00:00:00 2001 From: zhewa <314100793@qq.com> Date: Tue, 16 Jul 2019 13:15:46 -0400 Subject: [PATCH 036/149] version bump --- DESCRIPTION | 13 +++++---- inst/NEWS | 4 +++ vignettes/FindMarkers-analysis.Rmd | 44 +++++++++++++++--------------- 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 84314712..74547a80 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,13 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.1.5 -Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), - person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), - person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), - person("Zhe", "Wang", email="zhe@bu.edu", role = c("aut"))) +Version: 1.1.6 +Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", + role = c("aut", "cre")), + person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), + person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), + person("Shiyi", "Yang", email="syyang@bu.edu", role = c("aut")), + person("Eric", "Reed", email="reeder@bu.edu", role = c("aut")), + person("Zhe", "Wang", email="zhe@bu.edu", role = c("aut"))) Description: celda leverages Bayesian hierarchical modeling to cluster genes, cells, or both simultaneously from single cell sequencing data. Depends: diff --git a/inst/NEWS b/inst/NEWS index d5795a41..d52d5c91 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,7 @@ +Changes in version 1.1.6 (2019-07-16): + + o Add multiclass decision tree + Changes in version 1.1.4 (2019-05-28): o Add Alternate headings support for plotDimReduceFeature diff --git a/vignettes/FindMarkers-analysis.Rmd b/vignettes/FindMarkers-analysis.Rmd index 08f1829d..92943a12 100644 --- a/vignettes/FindMarkers-analysis.Rmd +++ b/vignettes/FindMarkers-analysis.Rmd @@ -87,12 +87,12 @@ class <- clusters(cm)$z ```{r, eval = TRUE, message = FALSE} DecTree <- findMarkers(features, - class, - oneoffMetric = "modified F1", - threshold = 0.95, - reuseFeatures = FALSE, - altSplit = TRUE, - consecutiveOneoff = FALSE) + class, + oneoffMetric = "modified F1", + threshold = 0.95, + reuseFeatures = FALSE, + altSplit = TRUE, + consecutiveOneoff = FALSE) ``` ## `findMarkers` output @@ -131,11 +131,11 @@ The `findMarkers` output is a named list of four elements ```{r, eval = TRUE, message = FALSE} plotDendro(DecTree, - classLabel = NULL, - addSensPrec = TRUE, - leafSize = 24, - boxSize = 7, - boxColor = "black") + classLabel = NULL, + addSensPrec = TRUE, + leafSize = 24, + boxSize = 7, + boxColor = "black") ``` In the plot, the feature(s) used for splits determined by the one-off metric are printed above the cluster labels for which they are markers. Alternatively, the features used for the balanced splits are centered above the two sets of clusters defined by that split. The up-regulated set of clusters for a balanced split are one the right side of that split. The size, sensitivitie and precision of each class are printed below the leaf labels, respectively. @@ -145,11 +145,11 @@ In `plotDendro` the **classLabel** argument may used to only print the sequence ```{r, eval = TRUE, message = FALSE} plotDendro(DecTree, - classLabel = "1", - addSensPrec = TRUE, - leafSize = 15, - boxSize = 7, - boxColor = "black") + classLabel = "1", + addSensPrec = TRUE, + leafSize = 15, + boxSize = 7, + boxColor = "black") ``` # Get label estimates from features matrix @@ -168,12 +168,12 @@ If you have a priori understanding of sub-groups of your cluster labels, you can # Run with a hierarchichal split cellTypes <- list(metaLabel = c("4", "5", "1")) DecTreeMeta <- findMarkers(features, - class, - cellTypes, - oneoffMetric = "modified F1", - threshold = 1, - reuseFeatures = F, - consecutiveOneoff = FALSE) + class, + cellTypes, + oneoffMetric = "modified F1", + threshold = 1, + reuseFeatures = F, + consecutiveOneoff = FALSE) plotDendro(DecTreeMeta) ``` From 141b1833e7edc18dc3a55bf80519aa31c413623e Mon Sep 17 00:00:00 2001 From: Irisapo Date: Mon, 5 Aug 2019 11:21:48 -0400 Subject: [PATCH 037/149] fix tsne plot clusterLabel bug --- R/plot_dr.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 535b3231..cb4326df 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -415,7 +415,8 @@ plotDimReduceCluster <- function(dim1, ggplot2::guide_legend(override.aes = list(size = 1))) if (labelClusters == TRUE) { - centroidList <- lapply(seq(length(unique(cluster))), function(x) { + #centroidList <- lapply(seq(length(unique(cluster))), function(x) { + centroidList <- lapply(unique(cluster), function(x) { df.sub <- df[df$Cluster == x, ] median.1 <- stats::median(df.sub$Dimension_1) median.2 <- stats::median(df.sub$Dimension_2) From 944c83acb7d3a75533b11c0e58fdd1b8daf019fa Mon Sep 17 00:00:00 2001 From: Irisapo Date: Mon, 5 Aug 2019 11:35:23 -0400 Subject: [PATCH 038/149] decontx vignette minor modification --- vignettes/DecontX-analysis.Rmd | 1 - 1 file changed, 1 deletion(-) diff --git a/vignettes/DecontX-analysis.Rmd b/vignettes/DecontX-analysis.Rmd index 4b116c25..882b911d 100644 --- a/vignettes/DecontX-analysis.Rmd +++ b/vignettes/DecontX-analysis.Rmd @@ -14,7 +14,6 @@ vignette: > # Introduction DecontX is a Bayesian hierarchical model to estimate and remove cross-contamination from ambient RNA in single-cell RNA-seq count data generated from droplet-based sequencing devices. DecontX will take the count matrix with/without the cell labels and estimate the contamination level and deliver a decontaminted count matrix for downstream analysis. ->>>>>>> campbio/master In this vignette we will demonstrate how to use decontX to estimate and remove contamination. From c13cbc671b14eaa2ae0488487b9926e5d4c878a1 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Mon, 5 Aug 2019 11:45:27 -0400 Subject: [PATCH 039/149] deocntx vignette minor modification (upper/lower case) --- vignettes/DecontX-analysis.Rmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vignettes/DecontX-analysis.Rmd b/vignettes/DecontX-analysis.Rmd index 882b911d..9d7061a0 100644 --- a/vignettes/DecontX-analysis.Rmd +++ b/vignettes/DecontX-analysis.Rmd @@ -1,12 +1,12 @@ --- -title: "Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with decontX" +title: "Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX" author: "Shiyi Yang, Sean Corbett, Yusuke Koga, Zhe Wang, W. Evan Johnson, Masanao Yajima, Joshua D. Campbell" date: "`r Sys.Date()`" output: BiocStyle::html_document: toc: true vignette: > - %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with decontX} + %\VignetteIndexEntry{Estimate and remove cross-contamination from ambient RNA for scRNA-seq data with DecontX} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -15,7 +15,7 @@ vignette: > DecontX is a Bayesian hierarchical model to estimate and remove cross-contamination from ambient RNA in single-cell RNA-seq count data generated from droplet-based sequencing devices. DecontX will take the count matrix with/without the cell labels and estimate the contamination level and deliver a decontaminted count matrix for downstream analysis. -In this vignette we will demonstrate how to use decontX to estimate and remove contamination. +In this vignette we will demonstrate how to use DecontX to estimate and remove contamination. # Installation @@ -40,7 +40,7 @@ To see the latest updates and releases or to post a bug, see our GitHub page at Many functions in *celda* make use of stochastic algorithms or procedures which require the use of random number generator (RNG) for simulation or sampling. To maintain reproducibility, all these functions use a **default seed of 12345** to make sure same results are generated each time one of these functions is called. Explicitly setting the `seed` arguments is needed for greater control and randomness. # Generation of a cross-contaminated dataset -decontX will take a matrix of counts (referred as observed counts) where each row is a feature, each column is a cell, and each entry in the matrix is the number of counts of each feature in each cell. To illustrate the utility of DecontX, we will apply it to a simulated dataset. +DecontX will take a matrix of counts (referred as observed counts) where each row is a feature, each column is a cell, and each entry in the matrix is the number of counts of each feature in each cell. To illustrate the utility of DecontX, we will apply it to a simulated dataset. In the function `simulateContaminatedMatrix`, the K parameter designates the number of cell clusters, the C parameter determines the number of cells, the G parameter determines the number of genes in the simulated dataset. From 916122c91b1985288f8904825045304e027744cb Mon Sep 17 00:00:00 2001 From: Irisapo Date: Tue, 6 Aug 2019 19:49:43 -0400 Subject: [PATCH 040/149] semiheatmap able to order group by colGroupOrder and rowGroupOrder --- R/celda_heatmap.R | 4 ++++ R/semi_pheatmap.R | 27 +++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/R/celda_heatmap.R b/R/celda_heatmap.R index e3d1e7f4..7fc5da28 100644 --- a/R/celda_heatmap.R +++ b/R/celda_heatmap.R @@ -79,6 +79,8 @@ plotHeatmap <- function(counts, z = NULL, y = NULL, + rowGroupOrder = NULL, + colGroupOrder = NULL, scaleRow = scale, trim = c(-2, 2), featureIx = NULL, @@ -276,6 +278,8 @@ plotHeatmap <- function(counts, treeHeightCol = treeheightCell, rowLabel = y, colLabel = z, + rowGroupOrder = rowGroupOrder, + colGroupOrder = colGroupOrder, silent = TRUE, ...) diff --git a/R/semi_pheatmap.R b/R/semi_pheatmap.R index d977a828..f1426c12 100755 --- a/R/semi_pheatmap.R +++ b/R/semi_pheatmap.R @@ -1552,6 +1552,8 @@ semiPheatmap <- function(mat, silent = FALSE, rowLabel, colLabel, + rowGroupOrder = NULL, + colGroupOrder = NULL, ...) { # Set labels @@ -1620,7 +1622,8 @@ semiPheatmap <- function(mat, if (is.null(rowLabel)) { rowLabel <- rep(1, nrow(mat)) } else { - o <- order(rowLabel) + #o <- order(rowLabel) + o <- .Order(labels=rowLabel, groupOrder=rowGroupOrder) mat <- mat[o, , drop = FALSE] fmat <- fmat[o, , drop = FALSE] rowLabel <- rowLabel[o] @@ -1654,7 +1657,8 @@ semiPheatmap <- function(mat, if (is.null(colLabel)) { colLabel <- rep(1, ncol(mat)) } else { - o <- order(colLabel) + #o <- order(colLabel) + o <- .Order(labels=colLabel, groupOrder=colGroupOrder) mat <- mat[, o, drop = FALSE] fmat <- fmat[, o, drop = FALSE] colLabel <- colLabel[o] @@ -1781,3 +1785,22 @@ semiPheatmap <- function(mat, treeCol = treeCol, gtable = gt)) } + + + + +# order function that order the row/column labels based on the order of the group priority +# return value is a vector of the ordered index +# labels is a vector of any non-zero length +# groupOrder, a column named dataframe/matrix with the "groupName" column storing the group name and the "groupIndex" storing the group priority +.Order = function(labels, groupOrder=NULL){ + if (is.null(groupOrder)) { + return(order(labels)) + } else { + # Throw error is length(unique(labels)) != nrow(groupOrder) + + olabels = plyr::mapvalues(x=labels, from=groupOrder[,"groupName"], to=groupOrder[,"groupIndex"]) + olabels = as.integer(olabels) # Make sure the olabels is integer for order() function + return(order(olabels)) + } +} From 94074a09fef818c7b6fc6bb9dd7ed222e5e16c92 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 21 Aug 2019 21:07:09 -0400 Subject: [PATCH 041/149] Fixed error in tSNE creation for celda_G where cells where being shuffled, but not put back in the right order before returning the object --- R/celda_G.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/celda_G.R b/R/celda_G.R index b94a7386..07e03746 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -608,9 +608,11 @@ setMethod("celdaTsne", min.cluster.size, modules) res = calculateTsne(prepared.count.info$norm, do.pca=FALSE, perplexity=perplexity, max.iter=max.iter, seed=seed) - rownames(res) = colnames(counts) - colnames(res) = c("tsne_1", "tsne_2") - return(res) + final = matrix(NA, nrow=ncol(counts), ncol=2) + final[prepared.count.info$cell.ix, ] = res + rownames(final) = colnames(counts) + colnames(final) = c("tsne_1", "tsne_2") + return(final) }) From 2a2e9249aa0247f1370b569fb612818ebe0db0d3 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 21 Aug 2019 21:37:18 -0400 Subject: [PATCH 042/149] Changed all tSNE/UMAP functions to not subsample cells by default --- DESCRIPTION | 2 +- R/celda_C.R | 18 +++++++++++------- R/celda_CG.R | 16 ++++++++++------ R/celda_G.R | 22 ++++++++++++---------- man/celdaTsne-celda_C-method.Rd | 4 ++-- man/celdaTsne-celda_CG-method.Rd | 4 ++-- man/celdaTsne-celda_G-method.Rd | 4 ++-- man/celdaUmap-celda_C-method.Rd | 4 ++-- man/celdaUmap-celda_CG-method.Rd | 4 ++-- man/celdaUmap-celda_G-method.Rd | 4 ++-- 10 files changed, 46 insertions(+), 36 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 511fdb56..6803a3e5 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -49,6 +49,6 @@ VignetteBuilder: knitr License: MIT Encoding: UTF-8 LazyData: true -RoxygenNote: 6.1.0 +RoxygenNote: 6.1.1 BugReports: https://github.com/definitelysean/celda/issues biocViews: SingleCell, GeneExpression, Clustering, Sequencing, Bayesian diff --git a/R/celda_C.R b/R/celda_C.R index fcd3cc6a..1be0b8cb 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -599,7 +599,7 @@ setMethod("celdaHeatmap", #' #' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. #' @param celda.mod Celda object of class `celda_C`. -#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. +#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL. #' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. #' @param initial.dims Integer. PCA will be used to reduce the dimentionality of the dataset. The top 'initial.dims' principal components will be used for tSNE. Default 20. #' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. @@ -613,7 +613,7 @@ setMethod("celdaHeatmap", #' @export setMethod("celdaTsne", signature(celda.mod = "celda_C"), - function(counts, celda.mod, max.cells=25000, min.cluster.size=100, + function(counts, celda.mod, max.cells=NULL, min.cluster.size=100, initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, seed=12345, ...) { @@ -636,7 +636,7 @@ setMethod("celdaTsne", #' #' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. #' @param celda.mod Celda object of class `celda_C`. -#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. +#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL. #' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. #' @param initial.dims Integer. PCA will be used to reduce the dimentionality of the dataset. The top 'initial.dims' principal components will be used for tSNE. Default 20. #' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. @@ -650,7 +650,7 @@ setMethod("celdaTsne", #' @export setMethod("celdaUmap", signature(celda.mod = "celda_C"), - function(counts, celda.mod, max.cells=25000, min.cluster.size=100, + function(counts, celda.mod, max.cells=NULL, min.cluster.size=100, modules=NULL, umap.config=umap::umap.defaults) { prepared.count.info = prepareCountsForDimReduction.celda_C(counts, celda.mod, max.cells, min.cluster.size, modules) @@ -669,12 +669,16 @@ prepareCountsForDimReduction.celda_C = function(counts, celda.mod, max.cells=250 compareCountMatrix(counts, celda.mod) ## Checking if max.cells and min.cluster.size will work - if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) { - stop(paste0("Cannot distribute ", max.cells, " cells among ", + if(!is.null(max.cells)) { + if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) { + stop(paste0("Cannot distribute ", max.cells, " cells among ", celda.mod@params$K, " clusters while maintaining a minumum of ", min.cluster.size, " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'.")) - } + } + } else { + max.cells = ncol(counts) + } ## Select a subset of cells to sample if greater than 'max.cells' total.cells.to.remove = ncol(counts) - max.cells diff --git a/R/celda_CG.R b/R/celda_CG.R index 5c0964bd..69ca2a0a 100755 --- a/R/celda_CG.R +++ b/R/celda_CG.R @@ -721,7 +721,7 @@ setMethod("celdaHeatmap", #' #' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. #' @param celda.mod Celda object of class `celda_CG`. -#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. +#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL. #' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. #' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL. #' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. @@ -735,7 +735,7 @@ setMethod("celdaHeatmap", #' @export setMethod("celdaTsne", signature(celda.mod = "celda_CG"), - function(counts, celda.mod, max.cells=25000, min.cluster.size=100, + function(counts, celda.mod, max.cells=NULL, min.cluster.size=100, initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, seed=12345, ...) { @@ -757,7 +757,7 @@ setMethod("celdaTsne", #' #' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. #' @param celda.mod Celda object of class `celda_CG`. -#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. +#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL. #' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. #' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL. #' @param umap.config Object of class `umap.config`. Configures parameters for umap. Default `umap::umap.defaults` @@ -768,7 +768,7 @@ setMethod("celdaTsne", #' @export setMethod("celdaUmap", signature(celda.mod = "celda_CG"), - function(counts, celda.mod, max.cells=25000, min.cluster.size=100, + function(counts, celda.mod, max.cells=NULL, min.cluster.size=100, modules=NULL, umap.config=umap::umap.defaults) { prepared.count.info = prepareCountsForDimReduction.celda_CG(counts, celda.mod, max.cells, min.cluster.size, @@ -783,16 +783,20 @@ setMethod("celdaUmap", }) -prepareCountsForDimReduction.celda_CG = function(counts, celda.mod, max.cells=25000, +prepareCountsForDimReduction.celda_CG = function(counts, celda.mod, max.cells=NULL, min.cluster.size=100, initial.dims=20, modules=NULL, ...) { ## Checking if max.cells and min.cluster.size will work + if(!is.null(max.cells)) { if((max.cells < ncol(counts)) & (max.cells / min.cluster.size < celda.mod@params$K)) { stop(paste0("Cannot distribute ", max.cells, " cells among ", celda.mod@params$K, " clusters while maintaining a minumum of ", min.cluster.size, " cells per cluster. Try increasing 'max.cells' or decreasing 'min.cluster.size'.")) } - + } else { + max.cells = ncol(counts) + } + fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, type="counts") modules.to.use = 1:nrow(fm$counts$cell) diff --git a/R/celda_G.R b/R/celda_G.R index 07e03746..59c90fbd 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -586,7 +586,7 @@ setMethod("celdaHeatmap", #' #' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. #' @param celda.mod Celda object of class `celda_G`. -#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(conts) > max.cells. Larger numbers of cells requires more memory. Default 10000. +#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL. #' @param modules Integer vector. Determines which feature modules to use for tSNE. If NULL, all modules will be used. Default NULL. #' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. #' @param max.iter Integer. Maximum number of iterations in tSNE generation. Default 2500. @@ -599,7 +599,7 @@ setMethod("celdaHeatmap", #' @export setMethod("celdaTsne", signature(celda.mod = "celda_G"), - function(counts, celda.mod, max.cells=25000, min.cluster.size=100, + function(counts, celda.mod, max.cells=NULL, min.cluster.size=100, initial.dims=20, modules=NULL, perplexity=20, max.iter=2500, seed=12345, ...) { @@ -621,7 +621,7 @@ setMethod("celdaTsne", #' #' @param counts Integer matrix. Rows represent features and columns represent cells. This matrix should be the same as the one used to generate `celda.mod`. #' @param celda.mod Celda object of class `celda_CG`. -#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000. +#' @param max.cells Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL. #' @param min.cluster.size Integer. Do not subsample cell clusters below this threshold. Default 100. #' @param modules Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL. #' @param umap.config Object of class `umap.config`. Configures parameters for umap. Default `umap::umap.defaults` @@ -632,7 +632,7 @@ setMethod("celdaTsne", #' @export setMethod("celdaUmap", signature(celda.mod = "celda_G"), - function(counts, celda.mod, max.cells=25000, min.cluster.size=100, + function(counts, celda.mod, max.cells=NULL, min.cluster.size=100, modules=NULL, umap.config=umap::umap.defaults) { prepared.count.info = prepareCountsForDimReduction.celda_G(counts, celda.mod, max.cells, min.cluster.size, @@ -646,11 +646,14 @@ setMethod("celdaUmap", }) -prepareCountsForDimReduction.celda_G = function(counts, celda.mod, max.cells=25000, min.cluster.size=100, +prepareCountsForDimReduction.celda_G = function(counts, celda.mod, max.cells=NULL, min.cluster.size=100, modules=NULL) { - if(max.cells > ncol(counts)) { - max.cells = ncol(counts) - } + if(is.null(max.cells) || max.cells > ncol(counts)) { + max.cells = ncol(counts) + cell.ix = 1:ncol(counts) + } else { + cell.ix = sample(1:ncol(counts), max.cells) + } fm = factorizeMatrix(counts=counts, celda.mod=celda.mod, type="counts") @@ -661,10 +664,9 @@ prepareCountsForDimReduction.celda_G = function(counts, celda.mod, max.cells=250 stop("'modules' must be a vector of numbers between 1 and ", modules.to.use, ".") } - modules.to.use = modules + modules.to.use = modules } - cell.ix = sample(1:ncol(counts), max.cells) norm = t(normalizeCounts(fm$counts$cell[modules.to.use,cell.ix], normalize="proportion", transformation.fun=sqrt)) diff --git a/man/celdaTsne-celda_C-method.Rd b/man/celdaTsne-celda_C-method.Rd index 16dd1f34..b447d259 100644 --- a/man/celdaTsne-celda_C-method.Rd +++ b/man/celdaTsne-celda_C-method.Rd @@ -5,7 +5,7 @@ \alias{celdaTsne,celda_C-method} \title{tSNE for celda_C} \usage{ -\S4method{celdaTsne}{celda_C}(counts, celda.mod, max.cells = 25000, +\S4method{celdaTsne}{celda_C}(counts, celda.mod, max.cells = NULL, min.cluster.size = 100, initial.dims = 20, modules = NULL, perplexity = 20, max.iter = 2500, seed = 12345, ...) } @@ -14,7 +14,7 @@ \item{celda.mod}{Celda object of class `celda_C`.} -\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} +\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL.} \item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} diff --git a/man/celdaTsne-celda_CG-method.Rd b/man/celdaTsne-celda_CG-method.Rd index bfc0df17..d176aa4e 100755 --- a/man/celdaTsne-celda_CG-method.Rd +++ b/man/celdaTsne-celda_CG-method.Rd @@ -5,7 +5,7 @@ \alias{celdaTsne,celda_CG-method} \title{tSNE for celda_CG} \usage{ -\S4method{celdaTsne}{celda_CG}(counts, celda.mod, max.cells = 25000, +\S4method{celdaTsne}{celda_CG}(counts, celda.mod, max.cells = NULL, min.cluster.size = 100, initial.dims = 20, modules = NULL, perplexity = 20, max.iter = 2500, seed = 12345, ...) } @@ -14,7 +14,7 @@ \item{celda.mod}{Celda object of class `celda_CG`.} -\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} +\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL.} \item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} diff --git a/man/celdaTsne-celda_G-method.Rd b/man/celdaTsne-celda_G-method.Rd index f4ee0c02..bb0403ad 100755 --- a/man/celdaTsne-celda_G-method.Rd +++ b/man/celdaTsne-celda_G-method.Rd @@ -5,7 +5,7 @@ \alias{celdaTsne,celda_G-method} \title{tSNE for celda_G} \usage{ -\S4method{celdaTsne}{celda_G}(counts, celda.mod, max.cells = 25000, +\S4method{celdaTsne}{celda_G}(counts, celda.mod, max.cells = NULL, min.cluster.size = 100, initial.dims = 20, modules = NULL, perplexity = 20, max.iter = 2500, seed = 12345, ...) } @@ -14,7 +14,7 @@ \item{celda.mod}{Celda object of class `celda_G`.} -\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(conts) > max.cells. Larger numbers of cells requires more memory. Default 10000.} +\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL.} \item{modules}{Integer vector. Determines which feature modules to use for tSNE. If NULL, all modules will be used. Default NULL.} diff --git a/man/celdaUmap-celda_C-method.Rd b/man/celdaUmap-celda_C-method.Rd index 208c4e97..3d31dc69 100755 --- a/man/celdaUmap-celda_C-method.Rd +++ b/man/celdaUmap-celda_C-method.Rd @@ -5,7 +5,7 @@ \alias{celdaUmap,celda_C-method} \title{umap for celda_C} \usage{ -\S4method{celdaUmap}{celda_C}(counts, celda.mod, max.cells = 25000, +\S4method{celdaUmap}{celda_C}(counts, celda.mod, max.cells = NULL, min.cluster.size = 100, modules = NULL, umap.config = umap::umap.defaults) } @@ -14,7 +14,7 @@ \item{celda.mod}{Celda object of class `celda_C`.} -\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} +\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL.} \item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} diff --git a/man/celdaUmap-celda_CG-method.Rd b/man/celdaUmap-celda_CG-method.Rd index 7e5464a4..992cc2eb 100755 --- a/man/celdaUmap-celda_CG-method.Rd +++ b/man/celdaUmap-celda_CG-method.Rd @@ -5,7 +5,7 @@ \alias{celdaUmap,celda_CG-method} \title{umap for celda_CG} \usage{ -\S4method{celdaUmap}{celda_CG}(counts, celda.mod, max.cells = 25000, +\S4method{celdaUmap}{celda_CG}(counts, celda.mod, max.cells = NULL, min.cluster.size = 100, modules = NULL, umap.config = umap::umap.defaults) } @@ -14,7 +14,7 @@ \item{celda.mod}{Celda object of class `celda_CG`.} -\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} +\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL.} \item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} diff --git a/man/celdaUmap-celda_G-method.Rd b/man/celdaUmap-celda_G-method.Rd index 3304ec05..b635c671 100755 --- a/man/celdaUmap-celda_G-method.Rd +++ b/man/celdaUmap-celda_G-method.Rd @@ -5,7 +5,7 @@ \alias{celdaUmap,celda_G-method} \title{umap for celda_G} \usage{ -\S4method{celdaUmap}{celda_G}(counts, celda.mod, max.cells = 25000, +\S4method{celdaUmap}{celda_G}(counts, celda.mod, max.cells = NULL, min.cluster.size = 100, modules = NULL, umap.config = umap::umap.defaults) } @@ -14,7 +14,7 @@ \item{celda.mod}{Celda object of class `celda_CG`.} -\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. Default 25000.} +\item{max.cells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > max.cells. Larger numbers of cells requires more memory. If NULL, no subsampling will be performed. Default NULL.} \item{min.cluster.size}{Integer. Do not subsample cell clusters below this threshold. Default 100.} From dc67065662e7cbe0e070fc38432102d5a76eb7e3 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 22 Aug 2019 09:30:17 -0400 Subject: [PATCH 043/149] Slightly tweaked descrption --- DESCRIPTION | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 74547a80..0afaf469 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,8 +8,7 @@ Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", person("Shiyi", "Yang", email="syyang@bu.edu", role = c("aut")), person("Eric", "Reed", email="reeder@bu.edu", role = c("aut")), person("Zhe", "Wang", email="zhe@bu.edu", role = c("aut"))) -Description: celda leverages Bayesian hierarchical modeling to cluster genes, - cells, or both simultaneously from single cell sequencing data. +Description: celda is a Bayesian hierarchical model that can co-cluster features and cells in single cell sequencing data. Depends: R (>= 3.6) VignetteBuilder: knitr From a7cd4582f7eda672510f8a81c896a83ce70bead1 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 22 Aug 2019 09:30:51 -0400 Subject: [PATCH 044/149] Fixed old reference from 'max.cells' to 'maxCells' --- R/celda_C.R | 4 ++-- R/celda_CG.R | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index af390ce6..9f578edd 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -1211,7 +1211,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), compareCountMatrix(counts, celdaMod) ## Checking if maxCells and minClusterSize will work - if(!is.null(max.cells)) { + if(!is.null(maxCells)) { if ((maxCells < ncol(counts)) & (maxCells / minClusterSize < params(celdaMod)$K)) { @@ -1225,7 +1225,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), " 'minClusterSize'.") } } else { - max.cells = ncol(counts) + maxCells = ncol(counts) } ## Select a subset of cells to sample if greater than 'maxCells' diff --git a/R/celda_CG.R b/R/celda_CG.R index 6e4aba9b..d3ebaf7b 100755 --- a/R/celda_CG.R +++ b/R/celda_CG.R @@ -1482,7 +1482,7 @@ setMethod("celdaUmap", modules = NULL) { ## Checking if maxCells and minClusterSize will work - if(!is.null(max.cells)) { + if(!is.null(maxCells)) { if ((maxCells < ncol(counts)) & (maxCells / minClusterSize < params(celdaMod)$K)) { @@ -1496,7 +1496,7 @@ setMethod("celdaUmap", " decreasing 'minClusterSize'.") } } else { - max.cells = ncol(counts) + maxCells = ncol(counts) } fm <- factorizeMatrix(counts = counts, From 58a3c6ab099a16edf951ae84bc97411e3d78687b Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 22 Aug 2019 11:48:36 -0400 Subject: [PATCH 045/149] Fixed lintr formatting --- R/celda_C.R | 6 +++--- R/celda_CG.R | 8 ++++---- R/celda_G.R | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index 9f578edd..b90c49bf 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -1211,7 +1211,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), compareCountMatrix(counts, celdaMod) ## Checking if maxCells and minClusterSize will work - if(!is.null(maxCells)) { + if (!is.null(maxCells)) { if ((maxCells < ncol(counts)) & (maxCells / minClusterSize < params(celdaMod)$K)) { @@ -1225,8 +1225,8 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), " 'minClusterSize'.") } } else { - maxCells = ncol(counts) - } + maxCells <- ncol(counts) + } ## Select a subset of cells to sample if greater than 'maxCells' totalCellsToRemove <- ncol(counts) - maxCells diff --git a/R/celda_CG.R b/R/celda_CG.R index d3ebaf7b..94a94b96 100755 --- a/R/celda_CG.R +++ b/R/celda_CG.R @@ -1482,7 +1482,7 @@ setMethod("celdaUmap", modules = NULL) { ## Checking if maxCells and minClusterSize will work - if(!is.null(maxCells)) { + if (!is.null(maxCells)) { if ((maxCells < ncol(counts)) & (maxCells / minClusterSize < params(celdaMod)$K)) { @@ -1496,9 +1496,9 @@ setMethod("celdaUmap", " decreasing 'minClusterSize'.") } } else { - maxCells = ncol(counts) - } - + maxCells <- ncol(counts) + } + fm <- factorizeMatrix(counts = counts, celdaMod = celdaMod, type = "counts") diff --git a/R/celda_G.R b/R/celda_G.R index 0e7b3aea..27e6ff55 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -1173,7 +1173,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), if (is.null(maxCells) || maxCells > ncol(counts)) { maxCells <- ncol(counts) - cellIx = 1:ncol(counts) + cellIx <- 1:ncol(counts) } else { cellIx <- sample(seq(ncol(counts)), maxCells) } From bd53e0fb56b515bc021f10e6dbac28a4e3bd8274 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 22 Aug 2019 22:37:35 -0400 Subject: [PATCH 046/149] Updated UMAP and tSNE functions. Enabled PCA on UMAP with celda_C results. Switched to 'uwot' package for UMAP generation --- NAMESPACE | 3 +- R/all_generics.R | 17 ++---- R/celda_C.R | 91 ++++++++++++++++++++++---------- R/celda_CG.R | 54 +++++++++++++++---- R/celda_G.R | 52 ++++++++++++++---- R/plot_dr.R | 43 +++++++++++---- man/celdaTsne-celda_C-method.Rd | 7 +-- man/celdaUmap-celda_C-method.Rd | 36 ++++++++++--- man/celdaUmap-celda_CG-method.Rd | 24 +++++++-- man/celdaUmap-celda_G-method.Rd | 26 +++++++-- man/celdaUmap.Rd | 24 +++------ 11 files changed, 267 insertions(+), 110 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index fdf6b708..401ad844 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -93,6 +93,7 @@ import(grid) import(gridExtra, except = c(combine)) import(magrittr) import(stats, except = c(start, end)) +import(uwot) importFrom(MAST,FromMatrix) importFrom(MAST,summary) importFrom(MAST,zlm) @@ -140,8 +141,6 @@ importFrom(scales,brewer_pal) importFrom(scales,dscale) importFrom(scales,hue_pal) importFrom(stringi,stri_list2matrix) -importFrom(umap,umap) -importFrom(umap,umap.defaults) importFrom(withr,with_seed) useDynLib(celda,"_colSumByGroup") useDynLib(celda,"_colSumByGroupChange") diff --git a/R/all_generics.R b/R/all_generics.R index 3c66133c..814de044 100755 --- a/R/all_generics.R +++ b/R/all_generics.R @@ -596,33 +596,26 @@ setGeneric("celdaTsne", #' requires more memory. Default 25000. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. -#' @param initialDims Integer. PCA will be used to reduce the dimentionality -#' of the dataset. The top 'initialDims' principal components will be used -#' for umap. Default 20. #' @param modules Integer vector. Determines which features modules to use for #' tSNE. If NULL, all modules will be used. Default NULL. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @param umapConfig An object of class "umapConfig" specifying parameters to #' the UMAP algorithm. -#' @return Numeric Matrix of dimension `ncol(counts)` x 2, colums representing -#' the "X" and "Y" coordinates in the data's t-SNE represetation. -#' @examples +#' @param ... Additional parameters to `uwot::umap` +#' @return A two column matrix of UMAP coordinates#' @examples #' data(celdaCGSim, celdaCGMod) -#' tsneRes <- celdaUmap(celdaCGSim$counts, celdaCGMod) -#' @importFrom umap umap.defaults +#' umapRes <- celdaUmap(celdaCGSim$counts, celdaCGMod) #' @export setGeneric("celdaUmap", signature = "celdaMod", function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, - initialDims = 20, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) { + ...) { standardGeneric("celdaUmap") }) diff --git a/R/celda_C.R b/R/celda_C.R index b90c49bf..581862ff 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -1042,8 +1042,6 @@ setMethod("celdaHeatmap", signature(celdaMod = "celda_C"), #' @param initialDims Integer. PCA will be used to reduce the dimentionality #' of the dataset. The top 'initialDims' principal components will be used #' for tSNE. Default 20. -#' @param modules Integer vector. Determines which features modules to use for -#' tSNE. If NULL, all modules will be used. Default NULL. #' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. #' @param maxIter Integer. Maximum number of iterations in tSNE generation. #' Default 2500. @@ -1063,7 +1061,6 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), maxCells = NULL, minClusterSize = 100, initialDims = 20, - modules = NULL, perplexity = 20, maxIter = 2500, seed = 12345) { @@ -1074,7 +1071,6 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), maxCells = maxCells, minClusterSize = minClusterSize, initialDims = initialDims, - modules = modules, perplexity = perplexity, maxIter = maxIter) } else { @@ -1084,7 +1080,6 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), maxCells = maxCells, minClusterSize = minClusterSize, initialDims = initialDims, - modules = modules, perplexity = perplexity, maxIter = maxIter)) } @@ -1098,15 +1093,13 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), maxCells = NULL, minClusterSize = 100, initialDims = 20, - modules = NULL, perplexity = 20, maxIter = 2500) { preparedCountInfo <- .prepareCountsForDimReductionCeldaC(counts, celdaMod, maxCells, - minClusterSize, - modules) + minClusterSize) res <- .calculateTsne(preparedCountInfo$norm, perplexity = perplexity, @@ -1117,7 +1110,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- res rownames(final) <- colnames(counts) - colnames(final) <- c("tsne_1", "tsne_2") + colnames(final) <- c("tSNE1", "tSNE_2") return(final) } @@ -1136,44 +1129,73 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), #' Default NULL. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. -#' @param modules Integer vector. Determines which features modules to use for -#' UMAP. If NULL, all modules will be used. Default NULL. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @param umapConfig An object of class "umap.config" specifying parameters to -#' the UMAP algorithm. +#' @param nNeighbors The size of local neighborhood used for +#' manifold approximation. Larger values result in more global +#' views of the manifold, while smaller values result in more +#' local data being preserved. Default 30. See `?uwot::umap` for more information. +#' @param minDist The effective minimum distance between embedded points. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. See `?uwot::umap` for more information. +#' @param spread The effective scale of embedded points. In combination with +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. See `?uwot::umap` for more information. +#' @param pca Logical. Whether to perform +#' dimensionality reduction with PCA before UMAP. +#' @param initialDims Integer. Number of dimensions from PCA to use as +#' input in UMAP. Default 50. +#' @param nThreads Number of threads to use. Default 1. +#' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_C()` for clustering cells and `celdaHeatmap()` for displaying #' expression. #' @examples #' data(celdaCSim, celdaCMod) #' umapRes <- celdaUmap(celdaCSim$counts, celdaCMod) -#' @return A two column matrix of umap coordinates +#' @return A two column matrix of UMAP coordinates #' @export setMethod("celdaUmap", signature(celdaMod = "celda_C"), function(counts, celdaMod, maxCells = NULL, minClusterSize = 100, - modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) { + nNeighbors = 30, + minDist = 0.2, + spread = 1, + pca = TRUE, + initialDims = 50, + nThreads = 1, + ...) { if (is.null(seed)) { res <- .celdaUmapC(counts = counts, celdaMod = celdaMod, maxCells = maxCells, minClusterSize = minClusterSize, - modules = modules, - umapConfig = umapConfig) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + pca = pca, + initialDims = initialDims, + nThreads = nThreads, + ...) } else { with_seed(seed, res <- .celdaUmapC(counts = counts, celdaMod = celdaMod, maxCells = maxCells, minClusterSize = minClusterSize, - modules = modules, - umapConfig = umapConfig)) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + pca = pca, + initialDims = initialDims, + nThreads = nThreads, + ...)) } return(res) @@ -1184,19 +1206,31 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), celdaMod, maxCells = NULL, minClusterSize = 100, - modules = NULL, - umapConfig = umap::umap.defaults) { + nNeighbors = 30, + minDist = 0.2, + spread = 1, + pca = TRUE, + initialDims = 50, + nThreads = 1, + ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaC(counts, celdaMod, maxCells, - minClusterSize, - modules) - res <- .calculateUmap(preparedCountInfo$norm, umapConfig) + minClusterSize) + umapRes <- .calculateUmap(preparedCountInfo$norm, + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + pca = pca, + initialDims = initialDims, + nThreads = nThreads, + ...) + final <- matrix(NA, nrow = ncol(counts), ncol = 2) - final[preparedCountInfo$cellIx, ] <- res + final[preparedCountInfo$cellIx, ] <- umapRes rownames(final) <- colnames(counts) - colnames(final) <- c("umap_1", "umap_2") + colnames(final) <- c("UMAP_1", "UMAP_2") return(final) } @@ -1204,8 +1238,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), .prepareCountsForDimReductionCeldaC <- function(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, - modules = NULL) { + minClusterSize = 100) { counts <- .processCounts(counts) compareCountMatrix(counts, celdaMod) diff --git a/R/celda_CG.R b/R/celda_CG.R index 94a94b96..516ea1ce 100755 --- a/R/celda_CG.R +++ b/R/celda_CG.R @@ -1389,7 +1389,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_CG"), final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- res rownames(final) <- colnames(counts) - colnames(final) <- c("tsne_1", "tsne_2") + colnames(final) <- c("tSNE_1", "tSNE_2") return(final) } @@ -1411,12 +1411,24 @@ setMethod("celdaTsne", signature(celdaMod = "celda_CG"), #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. #' @param modules Integer vector. Determines which features modules to use for -#' tSNE. If NULL, all modules will be used. Default NULL. +#' UMAP. If NULL, all modules will be used. Default NULL. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @param umapConfig Object of class `umap.config`. Configures parameters for -#' umap. Default `umap::umap.defaults`. +#' @param nNeighbors The size of local neighborhood used for +#' manifold approximation. Larger values result in more global +#' views of the manifold, while smaller values result in more +#' local data being preserved. Default 30. See `?uwot::umap` for more information. +#' @param minDist The effective minimum distance between embedded points. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. See `?uwot::umap` for more information. +#' @param spread The effective scale of embedded points. In combination with +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. See `?uwot::umap` for more information. +#' @param nThreads Number of threads to use. Default 1. +#' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_CG()` for clustering features and cells and `celdaHeatmap()` #' for displaying expression. #' @examples @@ -1431,7 +1443,11 @@ setMethod("celdaUmap", minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) { + nNeighbors = 30, + minDist = 0.2, + spread = 1, + nThreads = 1, + ...) { if (is.null(seed)) { res <- .celdaUmapCG(counts = counts, @@ -1439,7 +1455,11 @@ setMethod("celdaUmap", maxCells = maxCells, minClusterSize = minClusterSize, modules = modules, - umapConfig = umapConfig) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + nThreads = nThreads, + ...) } else { with_seed(seed, res <- .celdaUmapCG(counts = counts, @@ -1447,7 +1467,11 @@ setMethod("celdaUmap", maxCells = maxCells, minClusterSize = minClusterSize, modules = modules, - umapConfig = umapConfig)) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + nThreads = nThreads, + ...)) } return(res) @@ -1459,18 +1483,28 @@ setMethod("celdaUmap", maxCells = NULL, minClusterSize = 100, modules = NULL, - umapConfig = umap::umap.defaults) { + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + nThreads = nThreads, + ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaCG(counts, celdaMod, maxCells, minClusterSize, modules) - umapRes <- .calculateUmap(preparedCountInfo$norm, umapConfig) + umapRes <- .calculateUmap(preparedCountInfo$norm, + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + nThreads = nThreads, + ...) + final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- umapRes rownames(final) <- colnames(counts) - colnames(final) <- c("umap_1", "umap_2") + colnames(final) <- c("UMAP_1", "UMAP_2") return(final) } diff --git a/R/celda_G.R b/R/celda_G.R index 27e6ff55..851660b5 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -1101,12 +1101,24 @@ setMethod("celdaTsne", signature(celdaMod = "celda_G"), #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. #' @param modules Integer vector. Determines which features modules to use for -#' tSNE. If NULL, all modules will be used. Default NULL. +#' UMAP. If NULL, all modules will be used. Default NULL. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @param umapConfig Object of class `umap.config`. Configures parameters for -#' umap. Default `umap::umap.defaults`. +#' @param nNeighbors The size of local neighborhood used for +#' manifold approximation. Larger values result in more global +#' views of the manifold, while smaller values result in more +#' local data being preserved. Default 30. See `?uwot::umap` for more information. +#' @param minDist The effective minimum distance between embedded points. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. See `?uwot::umap` for more information. +#' @param spread The effective scale of embedded points. In combination with +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. See `?uwot::umap` for more information. +#' @param nThreads Number of threads to use. Default 1. +#' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_G()` for clustering features and cells and `celdaHeatmap()` #' for displaying expression #' @examples @@ -1117,11 +1129,15 @@ setMethod("celdaTsne", signature(celdaMod = "celda_G"), setMethod("celdaUmap", signature(celdaMod = "celda_G"), function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) { + nNeighbors = 30, + minDist = 0.2, + spread = 1, + nThreads = 1, + ...) { if (is.null(seed)) { res <- .celdaUmapG(counts = counts, @@ -1129,7 +1145,11 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), maxCells = maxCells, minClusterSize = minClusterSize, modules = modules, - umapConfig = umapConfig) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + nThreads = nThreads, + ...) } else { with_seed(seed, res <- .celdaUmapG(counts = counts, @@ -1137,7 +1157,11 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), maxCells = maxCells, minClusterSize = minClusterSize, modules = modules, - umapConfig = umapConfig)) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + nThreads = nThreads, + ...)) } return(res) @@ -1149,14 +1173,24 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), maxCells = NULL, minClusterSize = 100, modules = NULL, - umapConfig = umap::umap.defaults) { + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + nThreads = nThreads, + ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaCG(counts, celdaMod, maxCells, minClusterSize, modules) - umapRes <- .calculateUmap(preparedCountInfo$norm, umapConfig) + umapRes <- .calculateUmap(preparedCountInfo$norm, + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + nThreads = nThreads, + ...) + final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- umapRes rownames(final) <- colnames(counts) diff --git a/R/plot_dr.R b/R/plot_dr.R index cb4326df..e19a187c 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -445,14 +445,14 @@ plotDimReduceCluster <- function(dim1, # @param maxIter Numeric vector. Determines iterations for tsne. Default 1000. # @param doPca Logical. Whether to perform # dimensionality reduction with PCA before tSNE. -# @param initialDims Integer.Number of dimensions from PCA to use as -# input in tSNE. +# @param initialDims Integer. Number of dimensions from PCA to use as +# input in tSNE. Default 50. #' @importFrom Rtsne Rtsne .calculateTsne <- function(norm, perplexity = 20, maxIter = 2500, doPca = FALSE, - initialDims = 20) { + initialDims = 50) { res <- Rtsne::Rtsne( norm, @@ -467,12 +467,35 @@ plotDimReduceCluster <- function(dim1, } -# Run the umap algorithm for dimensionality reduction +# Run the UMAP algorithm for dimensionality reduction # @param norm Normalized count matrix. -# @param umapConfig An object of class umap.config, -# containing configuration parameters to be passed to umap. -# Default umap::umap.defualts. -#' @importFrom umap umap -.calculateUmap <- function(norm, umapConfig = umap::umap.defaults) { - return(umap::umap(norm, umapConfig)$layout) +# @param nNeighbors The size of local neighborhood used for +# manifold approximation. Larger values result in more global +# views of the manifold, while smaller values result in more +# local data being preserved. Default 30. See `?uwot::umap` for more information. +# @param minDist The effective minimum distance between embedded points. +# Smaller values will result in a more clustered/clumped +# embedding where nearby points on the manifold are drawn +# closer together, while larger values will result on a more +# even dispersal of points. Default 0.2. See `?uwot::umap` for more information. +# @param spread The effective scale of embedded points. In combination with +# ‘min_dist’, this determines how clustered/clumped the +# embedded points are. Default 1. See `?uwot::umap` for more information. +# @param pca Logical. Whether to perform +# dimensionality reduction with PCA before UMAP. +# @param initialDims Integer. Number of dimensions from PCA to use as +# input in UMAP. Default 50. +# @param nThreads Number of threads to use. Default 1. +# @param ... Other parameters to pass to `uwot::umap`. +#' @import uwot +.calculateUmap <- function(norm, nNeighbors = 30, minDist = 0.2, spread = 1, pca=FALSE, initialDims=50, nThreads = 1, ...) { + if (isTRUE(pca)) { + doPCA <- initialDims + } else { + doPCA <- NULL + } + + return(uwot::umap(norm, n_neighbors=nNeighbors, + min_dist = minDist, spread = spread, + n_threads = nThreads, n_sgd_threads = 1, pca = doPCA, ...)) } diff --git a/man/celdaTsne-celda_C-method.Rd b/man/celdaTsne-celda_C-method.Rd index a57ae2c4..6e6c9670 100644 --- a/man/celdaTsne-celda_C-method.Rd +++ b/man/celdaTsne-celda_C-method.Rd @@ -6,8 +6,8 @@ \title{tSNE for celda_C} \usage{ \S4method{celdaTsne}{celda_C}(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, initialDims = 20, modules = NULL, - perplexity = 20, maxIter = 2500, seed = 12345) + minClusterSize = 100, initialDims = 20, perplexity = 20, + maxIter = 2500, seed = 12345) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -28,9 +28,6 @@ threshold. Default 100.} of the dataset. The top 'initialDims' principal components will be used for tSNE. Default 20.} -\item{modules}{Integer vector. Determines which features modules to use for -tSNE. If NULL, all modules will be used. Default NULL.} - \item{perplexity}{Numeric. Perplexity parameter for tSNE. Default 20.} \item{maxIter}{Integer. Maximum number of iterations in tSNE generation. diff --git a/man/celdaUmap-celda_C-method.Rd b/man/celdaUmap-celda_C-method.Rd index da4e2eeb..9023aae0 100644 --- a/man/celdaUmap-celda_C-method.Rd +++ b/man/celdaUmap-celda_C-method.Rd @@ -6,8 +6,9 @@ \title{umap for celda_C} \usage{ \S4method{celdaUmap}{celda_C}(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) + minClusterSize = 100, seed = 12345, nNeighbors = 30, + minDist = 0.2, spread = 1, pca = TRUE, initialDims = 50, + nThreads = 1, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -24,18 +25,37 @@ Default NULL.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} -\item{modules}{Integer vector. Determines which features modules to use for -UMAP. If NULL, all modules will be used. Default NULL.} - \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} -\item{umapConfig}{An object of class "umap.config" specifying parameters to -the UMAP algorithm.} +\item{nNeighbors}{The size of local neighborhood used for +manifold approximation. Larger values result in more global +views of the manifold, while smaller values result in more +local data being preserved. Default 30. See `?uwot::umap` for more information.} + +\item{minDist}{The effective minimum distance between embedded points. +Smaller values will result in a more clustered/clumped +embedding where nearby points on the manifold are drawn +closer together, while larger values will result on a more +even dispersal of points. Default 0.2. See `?uwot::umap` for more information.} + +\item{spread}{The effective scale of embedded points. In combination with +‘min_dist’, this determines how clustered/clumped the +embedded points are. Default 1. See `?uwot::umap` for more information.} + +\item{pca}{Logical. Whether to perform +dimensionality reduction with PCA before UMAP.} + +\item{initialDims}{Integer. Number of dimensions from PCA to use as +input in UMAP. Default 50.} + +\item{nThreads}{Number of threads to use. Default 1.} + +\item{...}{Other parameters to pass to `uwot::umap`.} } \value{ -A two column matrix of umap coordinates +A two column matrix of UMAP coordinates } \description{ Embeds cells in two dimensions using umap based on a `celda_C` diff --git a/man/celdaUmap-celda_CG-method.Rd b/man/celdaUmap-celda_CG-method.Rd index 1aa91ab7..bf819375 100644 --- a/man/celdaUmap-celda_CG-method.Rd +++ b/man/celdaUmap-celda_CG-method.Rd @@ -7,7 +7,7 @@ \usage{ \S4method{celdaUmap}{celda_CG}(counts, celdaMod, maxCells = NULL, minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) + nNeighbors = 30, minDist = 0.2, spread = 1, nThreads = 1, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -25,14 +25,30 @@ Default NULL.} threshold. Default 100.} \item{modules}{Integer vector. Determines which features modules to use for -tSNE. If NULL, all modules will be used. Default NULL.} +UMAP. If NULL, all modules will be used. Default NULL.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} -\item{umapConfig}{Object of class `umap.config`. Configures parameters for -umap. Default `umap::umap.defaults`.} +\item{nNeighbors}{The size of local neighborhood used for +manifold approximation. Larger values result in more global +views of the manifold, while smaller values result in more +local data being preserved. Default 30. See `?uwot::umap` for more information.} + +\item{minDist}{The effective minimum distance between embedded points. +Smaller values will result in a more clustered/clumped +embedding where nearby points on the manifold are drawn +closer together, while larger values will result on a more +even dispersal of points. Default 0.2. See `?uwot::umap` for more information.} + +\item{spread}{The effective scale of embedded points. In combination with +‘min_dist’, this determines how clustered/clumped the +embedded points are. Default 1. See `?uwot::umap` for more information.} + +\item{nThreads}{Number of threads to use. Default 1.} + +\item{...}{Other parameters to pass to `uwot::umap`.} } \value{ A two column matrix of umap coordinates diff --git a/man/celdaUmap-celda_G-method.Rd b/man/celdaUmap-celda_G-method.Rd index 5dd72310..87f950d4 100644 --- a/man/celdaUmap-celda_G-method.Rd +++ b/man/celdaUmap-celda_G-method.Rd @@ -5,9 +5,9 @@ \alias{celdaUmap,celda_G-method} \title{umap for celda_G} \usage{ -\S4method{celdaUmap}{celda_G}(counts, celdaMod, maxCells = 25000, +\S4method{celdaUmap}{celda_G}(counts, celdaMod, maxCells = NULL, minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) + nNeighbors = 30, minDist = 0.2, spread = 1, nThreads = 1, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -25,14 +25,30 @@ Default NULL.} threshold. Default 100.} \item{modules}{Integer vector. Determines which features modules to use for -tSNE. If NULL, all modules will be used. Default NULL.} +UMAP. If NULL, all modules will be used. Default NULL.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} -\item{umapConfig}{Object of class `umap.config`. Configures parameters for -umap. Default `umap::umap.defaults`.} +\item{nNeighbors}{The size of local neighborhood used for +manifold approximation. Larger values result in more global +views of the manifold, while smaller values result in more +local data being preserved. Default 30. See `?uwot::umap` for more information.} + +\item{minDist}{The effective minimum distance between embedded points. +Smaller values will result in a more clustered/clumped +embedding where nearby points on the manifold are drawn +closer together, while larger values will result on a more +even dispersal of points. Default 0.2. See `?uwot::umap` for more information.} + +\item{spread}{The effective scale of embedded points. In combination with +‘min_dist’, this determines how clustered/clumped the +embedded points are. Default 1. See `?uwot::umap` for more information.} + +\item{nThreads}{Number of threads to use. Default 1.} + +\item{...}{Other parameters to pass to `uwot::umap`.} } \value{ A two column matrix of umap coordinates diff --git a/man/celdaUmap.Rd b/man/celdaUmap.Rd index af6667b6..e0a5dcd6 100644 --- a/man/celdaUmap.Rd +++ b/man/celdaUmap.Rd @@ -4,9 +4,8 @@ \alias{celdaUmap} \title{Embeds cells in two dimensions using umap.} \usage{ -celdaUmap(counts, celdaMod, maxCells = 25000, minClusterSize = 100, - initialDims = 20, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) +celdaUmap(counts, celdaMod, maxCells = NULL, minClusterSize = 100, + modules = NULL, seed = 12345, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -22,28 +21,21 @@ requires more memory. Default 25000.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} -\item{initialDims}{Integer. PCA will be used to reduce the dimentionality -of the dataset. The top 'initialDims' principal components will be used -for umap. Default 20.} - \item{modules}{Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to -\link[withr]{with_seed} are made.} - -\item{umapConfig}{An object of class "umapConfig" specifying parameters to +\link[withr]{with_seed} are made. the UMAP algorithm.} + +\item{...}{Additional parameters to `uwot::umap`} } \value{ -Numeric Matrix of dimension `ncol(counts)` x 2, colums representing - the "X" and "Y" coordinates in the data's t-SNE represetation. +A two column matrix of UMAP coordinates#' @examples +data(celdaCGSim, celdaCGMod) +umapRes <- celdaUmap(celdaCGSim$counts, celdaCGMod) } \description{ Embeds cells in two dimensions using umap. } -\examples{ -data(celdaCGSim, celdaCGMod) -tsneRes <- celdaUmap(celdaCGSim$counts, celdaCGMod) -} From 1d079f6a5f1e45bfcfff7daca9ed23cf5a81e386 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 22 Aug 2019 22:57:50 -0400 Subject: [PATCH 047/149] Updated DESCRIPTION with uwot --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0afaf469..cd002722 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -34,7 +34,7 @@ Imports: data.table, Rcpp, RcppEigen, - umap, + uwot, enrichR, stringi, SummarizedExperiment, From 0e481a8cbcfb475f091338161095f3e35f5b32df Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 22 Aug 2019 23:43:28 -0400 Subject: [PATCH 048/149] Changed default umap parameters and nThread to cores to be consistent with rest of package --- R/celda_C.R | 14 +++++++------- R/celda_CG.R | 14 +++++++------- R/celda_G.R | 12 ++++++------ R/plot_dr.R | 6 +++--- man/celdaUmap-celda_C-method.Rd | 6 +++--- man/celdaUmap-celda_CG-method.Rd | 4 ++-- man/celdaUmap-celda_G-method.Rd | 4 ++-- 7 files changed, 30 insertions(+), 30 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index 581862ff..a4fdccb9 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -1148,7 +1148,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), #' dimensionality reduction with PCA before UMAP. #' @param initialDims Integer. Number of dimensions from PCA to use as #' input in UMAP. Default 50. -#' @param nThreads Number of threads to use. Default 1. +#' @param cores Number of threads to use. Default 1. #' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_C()` for clustering cells and `celdaHeatmap()` for displaying #' expression. @@ -1164,11 +1164,11 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), minClusterSize = 100, seed = 12345, nNeighbors = 30, - minDist = 0.2, + minDist = 0.75, spread = 1, pca = TRUE, initialDims = 50, - nThreads = 1, + cores = 1, ...) { if (is.null(seed)) { @@ -1181,7 +1181,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), spread = spread, pca = pca, initialDims = initialDims, - nThreads = nThreads, + cores = cores, ...) } else { with_seed(seed, @@ -1194,7 +1194,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), spread = spread, pca = pca, initialDims = initialDims, - nThreads = nThreads, + cores = cores, ...)) } @@ -1211,7 +1211,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), spread = 1, pca = TRUE, initialDims = 50, - nThreads = 1, + cores = 1, ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaC(counts, @@ -1224,7 +1224,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), spread = spread, pca = pca, initialDims = initialDims, - nThreads = nThreads, + cores = cores, ...) final <- matrix(NA, nrow = ncol(counts), ncol = 2) diff --git a/R/celda_CG.R b/R/celda_CG.R index 516ea1ce..06651328 100755 --- a/R/celda_CG.R +++ b/R/celda_CG.R @@ -1427,7 +1427,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_CG"), #' @param spread The effective scale of embedded points. In combination with #' ‘min_dist’, this determines how clustered/clumped the #' embedded points are. Default 1. See `?uwot::umap` for more information. -#' @param nThreads Number of threads to use. Default 1. +#' @param cores Number of threads to use. Default 1. #' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_CG()` for clustering features and cells and `celdaHeatmap()` #' for displaying expression. @@ -1444,9 +1444,9 @@ setMethod("celdaUmap", modules = NULL, seed = 12345, nNeighbors = 30, - minDist = 0.2, + minDist = 0.75, spread = 1, - nThreads = 1, + cores = 1, ...) { if (is.null(seed)) { @@ -1458,7 +1458,7 @@ setMethod("celdaUmap", nNeighbors = nNeighbors, minDist = minDist, spread = spread, - nThreads = nThreads, + cores = cores, ...) } else { with_seed(seed, @@ -1470,7 +1470,7 @@ setMethod("celdaUmap", nNeighbors = nNeighbors, minDist = minDist, spread = spread, - nThreads = nThreads, + cores = cores, ...)) } @@ -1486,7 +1486,7 @@ setMethod("celdaUmap", nNeighbors = nNeighbors, minDist = minDist, spread = spread, - nThreads = nThreads, + cores = cores, ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaCG(counts, @@ -1498,7 +1498,7 @@ setMethod("celdaUmap", nNeighbors = nNeighbors, minDist = minDist, spread = spread, - nThreads = nThreads, + cores = cores, ...) final <- matrix(NA, nrow = ncol(counts), ncol = 2) diff --git a/R/celda_G.R b/R/celda_G.R index 851660b5..bddb5ad5 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -1117,7 +1117,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_G"), #' @param spread The effective scale of embedded points. In combination with #' ‘min_dist’, this determines how clustered/clumped the #' embedded points are. Default 1. See `?uwot::umap` for more information. -#' @param nThreads Number of threads to use. Default 1. +#' @param cores Number of threads to use. Default 1. #' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_G()` for clustering features and cells and `celdaHeatmap()` #' for displaying expression @@ -1136,7 +1136,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), nNeighbors = 30, minDist = 0.2, spread = 1, - nThreads = 1, + cores = 1, ...) { if (is.null(seed)) { @@ -1148,7 +1148,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), nNeighbors = nNeighbors, minDist = minDist, spread = spread, - nThreads = nThreads, + cores = cores, ...) } else { with_seed(seed, @@ -1160,7 +1160,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), nNeighbors = nNeighbors, minDist = minDist, spread = spread, - nThreads = nThreads, + cores = cores, ...)) } @@ -1176,7 +1176,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), nNeighbors = nNeighbors, minDist = minDist, spread = spread, - nThreads = nThreads, + cores = cores, ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaCG(counts, @@ -1188,7 +1188,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), nNeighbors = nNeighbors, minDist = minDist, spread = spread, - nThreads = nThreads, + cores = cores, ...) final <- matrix(NA, nrow = ncol(counts), ncol = 2) diff --git a/R/plot_dr.R b/R/plot_dr.R index e19a187c..a339ca15 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -485,10 +485,10 @@ plotDimReduceCluster <- function(dim1, # dimensionality reduction with PCA before UMAP. # @param initialDims Integer. Number of dimensions from PCA to use as # input in UMAP. Default 50. -# @param nThreads Number of threads to use. Default 1. +# @param cores Number of threads to use. Default 1. # @param ... Other parameters to pass to `uwot::umap`. #' @import uwot -.calculateUmap <- function(norm, nNeighbors = 30, minDist = 0.2, spread = 1, pca=FALSE, initialDims=50, nThreads = 1, ...) { +.calculateUmap <- function(norm, nNeighbors = 30, minDist = 0.75, spread = 1, pca=FALSE, initialDims=50, cores = 1, ...) { if (isTRUE(pca)) { doPCA <- initialDims } else { @@ -497,5 +497,5 @@ plotDimReduceCluster <- function(dim1, return(uwot::umap(norm, n_neighbors=nNeighbors, min_dist = minDist, spread = spread, - n_threads = nThreads, n_sgd_threads = 1, pca = doPCA, ...)) + n_threads = cores, n_sgd_threads = 1, pca = doPCA, ...)) } diff --git a/man/celdaUmap-celda_C-method.Rd b/man/celdaUmap-celda_C-method.Rd index 9023aae0..60e51cf0 100644 --- a/man/celdaUmap-celda_C-method.Rd +++ b/man/celdaUmap-celda_C-method.Rd @@ -7,8 +7,8 @@ \usage{ \S4method{celdaUmap}{celda_C}(counts, celdaMod, maxCells = NULL, minClusterSize = 100, seed = 12345, nNeighbors = 30, - minDist = 0.2, spread = 1, pca = TRUE, initialDims = 50, - nThreads = 1, ...) + minDist = 0.75, spread = 1, pca = TRUE, initialDims = 50, + cores = 1, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -50,7 +50,7 @@ dimensionality reduction with PCA before UMAP.} \item{initialDims}{Integer. Number of dimensions from PCA to use as input in UMAP. Default 50.} -\item{nThreads}{Number of threads to use. Default 1.} +\item{cores}{Number of threads to use. Default 1.} \item{...}{Other parameters to pass to `uwot::umap`.} } diff --git a/man/celdaUmap-celda_CG-method.Rd b/man/celdaUmap-celda_CG-method.Rd index bf819375..669108d9 100644 --- a/man/celdaUmap-celda_CG-method.Rd +++ b/man/celdaUmap-celda_CG-method.Rd @@ -7,7 +7,7 @@ \usage{ \S4method{celdaUmap}{celda_CG}(counts, celdaMod, maxCells = NULL, minClusterSize = 100, modules = NULL, seed = 12345, - nNeighbors = 30, minDist = 0.2, spread = 1, nThreads = 1, ...) + nNeighbors = 30, minDist = 0.75, spread = 1, cores = 1, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -46,7 +46,7 @@ even dispersal of points. Default 0.2. See `?uwot::umap` for more information.} ‘min_dist’, this determines how clustered/clumped the embedded points are. Default 1. See `?uwot::umap` for more information.} -\item{nThreads}{Number of threads to use. Default 1.} +\item{cores}{Number of threads to use. Default 1.} \item{...}{Other parameters to pass to `uwot::umap`.} } diff --git a/man/celdaUmap-celda_G-method.Rd b/man/celdaUmap-celda_G-method.Rd index 87f950d4..1ef1ccc8 100644 --- a/man/celdaUmap-celda_G-method.Rd +++ b/man/celdaUmap-celda_G-method.Rd @@ -7,7 +7,7 @@ \usage{ \S4method{celdaUmap}{celda_G}(counts, celdaMod, maxCells = NULL, minClusterSize = 100, modules = NULL, seed = 12345, - nNeighbors = 30, minDist = 0.2, spread = 1, nThreads = 1, ...) + nNeighbors = 30, minDist = 0.2, spread = 1, cores = 1, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -46,7 +46,7 @@ even dispersal of points. Default 0.2. See `?uwot::umap` for more information.} ‘min_dist’, this determines how clustered/clumped the embedded points are. Default 1. See `?uwot::umap` for more information.} -\item{nThreads}{Number of threads to use. Default 1.} +\item{cores}{Number of threads to use. Default 1.} \item{...}{Other parameters to pass to `uwot::umap`.} } From 6d6cb8e04669685b4ebc0b18b4e8c96547a99dbb Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 22 Aug 2019 23:52:47 -0400 Subject: [PATCH 049/149] lintr fixes --- R/all_generics.R | 2 +- R/celda_C.R | 20 ++++++++++-------- R/celda_CG.R | 17 +++++++++------- R/celda_G.R | 19 +++++++++-------- R/plot_dr.R | 35 +++++++++++++++++++++----------- man/celdaUmap-celda_C-method.Rd | 6 ++++-- man/celdaUmap-celda_CG-method.Rd | 9 +++++--- man/celdaUmap-celda_G-method.Rd | 9 +++++--- 8 files changed, 72 insertions(+), 45 deletions(-) diff --git a/R/all_generics.R b/R/all_generics.R index 814de044..f367468c 100755 --- a/R/all_generics.R +++ b/R/all_generics.R @@ -615,7 +615,7 @@ setGeneric("celdaUmap", minClusterSize = 100, modules = NULL, seed = 12345, - ...) { + ...) { standardGeneric("celdaUmap") }) diff --git a/R/celda_C.R b/R/celda_C.R index a4fdccb9..e2a553f2 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -1135,15 +1135,17 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), #' @param nNeighbors The size of local neighborhood used for #' manifold approximation. Larger values result in more global #' views of the manifold, while smaller values result in more -#' local data being preserved. Default 30. See `?uwot::umap` for more information. +#' local data being preserved. Default 30. +#' See `?uwot::umap` for more information. #' @param minDist The effective minimum distance between embedded points. -#' Smaller values will result in a more clustered/clumped -#' embedding where nearby points on the manifold are drawn -#' closer together, while larger values will result on a more -#' even dispersal of points. Default 0.2. See `?uwot::umap` for more information. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. +#' See `?uwot::umap` for more information. #' @param spread The effective scale of embedded points. In combination with -#' ‘min_dist’, this determines how clustered/clumped the -#' embedded points are. Default 1. See `?uwot::umap` for more information. +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. See `?uwot::umap` for more information. #' @param pca Logical. Whether to perform #' dimensionality reduction with PCA before UMAP. #' @param initialDims Integer. Number of dimensions from PCA to use as @@ -1168,7 +1170,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), spread = 1, pca = TRUE, initialDims = 50, - cores = 1, + cores = 1, ...) { if (is.null(seed)) { @@ -1223,7 +1225,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), minDist = minDist, spread = spread, pca = pca, - initialDims = initialDims, + initialDims = initialDims, cores = cores, ...) diff --git a/R/celda_CG.R b/R/celda_CG.R index 06651328..8f13132e 100755 --- a/R/celda_CG.R +++ b/R/celda_CG.R @@ -1418,15 +1418,18 @@ setMethod("celdaTsne", signature(celdaMod = "celda_CG"), #' @param nNeighbors The size of local neighborhood used for #' manifold approximation. Larger values result in more global #' views of the manifold, while smaller values result in more -#' local data being preserved. Default 30. See `?uwot::umap` for more information. +#' local data being preserved. Default 30. +#' See `?uwot::umap` for more information. #' @param minDist The effective minimum distance between embedded points. -#' Smaller values will result in a more clustered/clumped -#' embedding where nearby points on the manifold are drawn -#' closer together, while larger values will result on a more -#' even dispersal of points. Default 0.2. See `?uwot::umap` for more information. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. +#' See `?uwot::umap` for more information. #' @param spread The effective scale of embedded points. In combination with -#' ‘min_dist’, this determines how clustered/clumped the -#' embedded points are. Default 1. See `?uwot::umap` for more information. +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. +#' See `?uwot::umap` for more information. #' @param cores Number of threads to use. Default 1. #' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_CG()` for clustering features and cells and `celdaHeatmap()` diff --git a/R/celda_G.R b/R/celda_G.R index bddb5ad5..ae6ae5bc 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -1108,15 +1108,18 @@ setMethod("celdaTsne", signature(celdaMod = "celda_G"), #' @param nNeighbors The size of local neighborhood used for #' manifold approximation. Larger values result in more global #' views of the manifold, while smaller values result in more -#' local data being preserved. Default 30. See `?uwot::umap` for more information. +#' local data being preserved. Default 30. +#' See `?uwot::umap` for more information. #' @param minDist The effective minimum distance between embedded points. -#' Smaller values will result in a more clustered/clumped -#' embedding where nearby points on the manifold are drawn -#' closer together, while larger values will result on a more -#' even dispersal of points. Default 0.2. See `?uwot::umap` for more information. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. +#' See `?uwot::umap` for more information. #' @param spread The effective scale of embedded points. In combination with -#' ‘min_dist’, this determines how clustered/clumped the -#' embedded points are. Default 1. See `?uwot::umap` for more information. +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. +#' See `?uwot::umap` for more information. #' @param cores Number of threads to use. Default 1. #' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_G()` for clustering features and cells and `celdaHeatmap()` @@ -1190,7 +1193,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), spread = spread, cores = cores, ...) - + final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- umapRes rownames(final) <- colnames(counts) diff --git a/R/plot_dr.R b/R/plot_dr.R index a339ca15..32be1803 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -472,15 +472,18 @@ plotDimReduceCluster <- function(dim1, # @param nNeighbors The size of local neighborhood used for # manifold approximation. Larger values result in more global # views of the manifold, while smaller values result in more -# local data being preserved. Default 30. See `?uwot::umap` for more information. +# local data being preserved. Default 30. +# See `?uwot::umap` for more information. # @param minDist The effective minimum distance between embedded points. -# Smaller values will result in a more clustered/clumped -# embedding where nearby points on the manifold are drawn -# closer together, while larger values will result on a more -# even dispersal of points. Default 0.2. See `?uwot::umap` for more information. +# Smaller values will result in a more clustered/clumped +# embedding where nearby points on the manifold are drawn +# closer together, while larger values will result on a more +# even dispersal of points. Default 0.2. +# See `?uwot::umap` for more information. # @param spread The effective scale of embedded points. In combination with -# ‘min_dist’, this determines how clustered/clumped the -# embedded points are. Default 1. See `?uwot::umap` for more information. +# ‘min_dist’, this determines how clustered/clumped the +# embedded points are. Default 1. +# See `?uwot::umap` for more information. # @param pca Logical. Whether to perform # dimensionality reduction with PCA before UMAP. # @param initialDims Integer. Number of dimensions from PCA to use as @@ -488,14 +491,22 @@ plotDimReduceCluster <- function(dim1, # @param cores Number of threads to use. Default 1. # @param ... Other parameters to pass to `uwot::umap`. #' @import uwot -.calculateUmap <- function(norm, nNeighbors = 30, minDist = 0.75, spread = 1, pca=FALSE, initialDims=50, cores = 1, ...) { +.calculateUmap <- function(norm, + nNeighbors = 30, + minDist = 0.75, + spread = 1, + pca=FALSE, + initialDims = 50, + cores = 1, + ...) { if (isTRUE(pca)) { doPCA <- initialDims } else { doPCA <- NULL } - - return(uwot::umap(norm, n_neighbors=nNeighbors, - min_dist = minDist, spread = spread, - n_threads = cores, n_sgd_threads = 1, pca = doPCA, ...)) + + res <- uwot::umap(norm, n_neighbors=nNeighbors, + min_dist = minDist, spread = spread, + n_threads = cores, n_sgd_threads = 1, pca = doPCA, ...) + return(res) } diff --git a/man/celdaUmap-celda_C-method.Rd b/man/celdaUmap-celda_C-method.Rd index 60e51cf0..b2bfd4b2 100644 --- a/man/celdaUmap-celda_C-method.Rd +++ b/man/celdaUmap-celda_C-method.Rd @@ -32,13 +32,15 @@ a default value of 12345 is used. If NULL, no calls to \item{nNeighbors}{The size of local neighborhood used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more -local data being preserved. Default 30. See `?uwot::umap` for more information.} +local data being preserved. Default 30. +See `?uwot::umap` for more information.} \item{minDist}{The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more -even dispersal of points. Default 0.2. See `?uwot::umap` for more information.} +even dispersal of points. Default 0.2. +See `?uwot::umap` for more information.} \item{spread}{The effective scale of embedded points. In combination with ‘min_dist’, this determines how clustered/clumped the diff --git a/man/celdaUmap-celda_CG-method.Rd b/man/celdaUmap-celda_CG-method.Rd index 669108d9..2bfd3775 100644 --- a/man/celdaUmap-celda_CG-method.Rd +++ b/man/celdaUmap-celda_CG-method.Rd @@ -34,17 +34,20 @@ a default value of 12345 is used. If NULL, no calls to \item{nNeighbors}{The size of local neighborhood used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more -local data being preserved. Default 30. See `?uwot::umap` for more information.} +local data being preserved. Default 30. +See `?uwot::umap` for more information.} \item{minDist}{The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more -even dispersal of points. Default 0.2. See `?uwot::umap` for more information.} +even dispersal of points. Default 0.2. +See `?uwot::umap` for more information.} \item{spread}{The effective scale of embedded points. In combination with ‘min_dist’, this determines how clustered/clumped the -embedded points are. Default 1. See `?uwot::umap` for more information.} +embedded points are. Default 1. +See `?uwot::umap` for more information.} \item{cores}{Number of threads to use. Default 1.} diff --git a/man/celdaUmap-celda_G-method.Rd b/man/celdaUmap-celda_G-method.Rd index 1ef1ccc8..0a3aa2cf 100644 --- a/man/celdaUmap-celda_G-method.Rd +++ b/man/celdaUmap-celda_G-method.Rd @@ -34,17 +34,20 @@ a default value of 12345 is used. If NULL, no calls to \item{nNeighbors}{The size of local neighborhood used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more -local data being preserved. Default 30. See `?uwot::umap` for more information.} +local data being preserved. Default 30. +See `?uwot::umap` for more information.} \item{minDist}{The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more -even dispersal of points. Default 0.2. See `?uwot::umap` for more information.} +even dispersal of points. Default 0.2. +See `?uwot::umap` for more information.} \item{spread}{The effective scale of embedded points. In combination with ‘min_dist’, this determines how clustered/clumped the -embedded points are. Default 1. See `?uwot::umap` for more information.} +embedded points are. Default 1. +See `?uwot::umap` for more information.} \item{cores}{Number of threads to use. Default 1.} From add0826a6aae765599f73363c31ab281e4b3834e Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 23 Aug 2019 10:50:37 -0400 Subject: [PATCH 050/149] lintr fixes --- R/plot_dr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 32be1803..3ff70c18 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -505,7 +505,7 @@ plotDimReduceCluster <- function(dim1, doPCA <- NULL } - res <- uwot::umap(norm, n_neighbors=nNeighbors, + res <- uwot::umap(norm, n_neighbors = nNeighbors, min_dist = minDist, spread = spread, n_threads = cores, n_sgd_threads = 1, pca = doPCA, ...) return(res) From d7f8817bd5a9100e11201e26ace5bf2e771c73b0 Mon Sep 17 00:00:00 2001 From: salzcamino Date: Wed, 28 Aug 2019 12:52:57 -0400 Subject: [PATCH 051/149] added ncol option for dim reduce plots --- R/plot_dr.R | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index cb4326df..7192e7db 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -48,6 +48,7 @@ plotDimReduceGrid <- function(dim1, colorMid, colorHigh, varLabel, + ncol = NULL, headers = NULL) { df <- data.frame(dim1, dim2, t(as.data.frame(matrix))) @@ -66,7 +67,6 @@ plotDimReduceGrid <- function(dim1, ggplot2::geom_point(stat = "identity", size = size, ggplot2::aes_string(color = varLabel)) + - ggplot2::facet_wrap(~ facet, labeller = headers) + ggplot2::theme_bw() + ggplot2::scale_colour_gradient2(low = colorLow, high = colorHigh, @@ -79,6 +79,11 @@ plotDimReduceGrid <- function(dim1, panel.spacing = unit(0, "lines"), panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black")) + if (isFALSE(is.null(ncol))){ + g <- g + ggplot2::facet_wrap(~ facet, labeller = headers, ncol = ncol) + } else { + g <- g + ggplot2::facet_wrap(~ facet, labeller = headers) + } } else { g <- ggplot2::ggplot(m, ggplot2::aes_string(x = xlab, y = ylab)) + @@ -98,6 +103,11 @@ plotDimReduceGrid <- function(dim1, panel.spacing = unit(0, "lines"), panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black")) + if (isFALSE(is.null(ncol))){ + g <- g + ggplot2::facet_wrap(~ facet, ncol = ncol) + } else { + g <- g + ggplot2::facet_wrap(~ facet) + } } return(g) } @@ -147,7 +157,7 @@ plotDimReduceGrid <- function(dim1, #' exactMatch = TRUE) #' } #' @export -plotDimReduceFeature <- function(dim1, +plotDimReduceFeature2 <- function(dim1, dim2, counts, features, @@ -160,7 +170,8 @@ plotDimReduceFeature <- function(dim1, ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, - colorHigh = "blue") { + colorHigh = "blue", + ncol = NULL) { if (isFALSE(is.null(headers))) { if (length(headers) != length(features)) { @@ -250,6 +261,7 @@ plotDimReduceFeature <- function(dim1, colorMid, colorHigh, varLabel, + ncol, headers) } @@ -307,7 +319,8 @@ plotDimReduceModule <- ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, - colorHigh = "blue") { + colorHigh = "blue", + ncol = NULL) { factorized <- factorizeMatrix(celdaMod = celdaMod, counts = counts) @@ -344,7 +357,8 @@ plotDimReduceModule <- colorLow, colorMid, colorHigh, - varLabel) + varLabel, + ncol) } From 73f505e04b2fd5cd3d80f0fdafcbb9b87fbd1dc3 Mon Sep 17 00:00:00 2001 From: salzcamino Date: Wed, 28 Aug 2019 12:55:04 -0400 Subject: [PATCH 052/149] added ncol option for dim reduce plots --- R/plot_dr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 7192e7db..35eaf1b7 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -157,7 +157,7 @@ plotDimReduceGrid <- function(dim1, #' exactMatch = TRUE) #' } #' @export -plotDimReduceFeature2 <- function(dim1, +plotDimReduceFeature <- function(dim1, dim2, counts, features, From ed19ae07553b1fa16d3d7619496761b40d448e65 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Wed, 18 Sep 2019 10:33:08 -0400 Subject: [PATCH 053/149] cyclocomp_linter(complexity_limit = 40) --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6d7e52e3..d0f4a664 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,4 +24,4 @@ r_github_packages: after_success: - R CMD INSTALL $PKG_TARBALL - Rscript -e 'covr::coveralls()' - - Rscript -e 'library(lintr); lint_package(linters = with_defaults(object_length_linter(length = 40L), object_name_linter = NULL, commented_code_linter = NULL, object_usage_linter = NULL), exclusions = list("R/RcppExports.R"))' \ No newline at end of file + - Rscript -e 'library(lintr); lint_package(linters = with_defaults(object_length_linter(length = 40L), object_name_linter = NULL, commented_code_linter = NULL, object_usage_linter = NULL, cyclocomp_linter(complexity_limit = 40)), exclusions = list("R/RcppExports.R"))' \ No newline at end of file From c3f836a6c7139ba33b6788ca6327c2b3b511a0cb Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Thu, 19 Sep 2019 13:11:16 -0400 Subject: [PATCH 054/149] add empty line --- R/plotDendro.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plotDendro.R b/R/plotDendro.R index 683574cf..4b8880d5 100644 --- a/R/plotDendro.R +++ b/R/plotDendro.R @@ -234,4 +234,4 @@ plotDendro <- function(decisionTree, } return(dendro) -} \ No newline at end of file +} From 926f61e5fe9724ffd334719790aa1ccf255e92af Mon Sep 17 00:00:00 2001 From: Irisapo Date: Mon, 14 Oct 2019 12:41:52 -0400 Subject: [PATCH 055/149] code reformat --- R/celda_G.R | 2 +- R/celda_heatmap.R | 2 +- R/decon.R | 2 +- R/findMarkers.R | 8 ++++---- R/getDecisions.R | 2 +- R/semi_pheatmap.R | 37 ++++++++++++++++++++----------------- 6 files changed, 28 insertions(+), 25 deletions(-) diff --git a/R/celda_G.R b/R/celda_G.R index ae6ae5bc..5f2b566a 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -1210,7 +1210,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), if (is.null(maxCells) || maxCells > ncol(counts)) { maxCells <- ncol(counts) - cellIx <- 1:ncol(counts) + cellIx <- seq_len(ncol(counts)) } else { cellIx <- sample(seq(ncol(counts)), maxCells) } diff --git a/R/celda_heatmap.R b/R/celda_heatmap.R index 7fc5da28..80fb4046 100644 --- a/R/celda_heatmap.R +++ b/R/celda_heatmap.R @@ -80,7 +80,7 @@ plotHeatmap <- function(counts, z = NULL, y = NULL, rowGroupOrder = NULL, - colGroupOrder = NULL, + colGroupOrder = NULL, scaleRow = scale, trim = c(-2, 2), featureIx = NULL, diff --git a/R/decon.R b/R/decon.R index f296e493..839c985b 100644 --- a/R/decon.R +++ b/R/decon.R @@ -750,7 +750,7 @@ addLogLikelihood <- function(llA, llB) { # transitional z label cbZ <- as.integer(plyr::mapvalues(cbZ, from = levels(cbZ), - to = 1:length(levels(cbZ)))) + to = seq_len(length(levels(cbZ))))) return(list( diff --git a/R/findMarkers.R b/R/findMarkers.R index b4405988..bf8ada3e 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -101,7 +101,7 @@ findMarkers <- function(features, stop("NA class values") } - if (any(is.na(features))){ + if (any(is.na(features))) { stop("NA feature values") } @@ -178,7 +178,7 @@ findMarkers <- function(features, dendro <- tree$dendro # Create separate trees for each cell type with more than one cluster - newTrees <- lapply(largeCellTypes, function(cellType){ + newTrees <- lapply(largeCellTypes, function(cellType) { # Print current status message("Building tree for cell type ", cellType) @@ -268,7 +268,7 @@ findMarkers <- function(features, cellTypeDendro <- newTrees[[cellType]]$dendro # Adjust labels, member count, and midpoint of nodes - dendro <- dendrapply(dendro, function(node){ + dendro <- dendrapply(dendro, function(node) { # Check if in right branch if (cellType %in% as.character(attributes(node)$classLabels)) { # Replace cell type label with subtype labels @@ -311,7 +311,7 @@ findMarkers <- function(features, cellTypeHeight <- attributes(cellTypeDendro)$height cellTypeDendro <- dendrapply(cellTypeDendro, function(node, parentHeight, cellTypeHeight) { - if (attributes(node)$height > 1){ + if (attributes(node)$height > 1) { attributes(node)$height <- parentHeight - 1 - (cellTypeHeight - attributes( node)$height) diff --git a/R/getDecisions.R b/R/getDecisions.R index 118be596..69ed7014 100644 --- a/R/getDecisions.R +++ b/R/getDecisions.R @@ -60,7 +60,7 @@ getDecisions <- function(rules, features) { ruleClass$sample <- samp[ruleClass$feature] # For multiple direction == 1, use one with the top stat - if (sum(ruleClass$direction == 1) > 1){ + if (sum(ruleClass$direction == 1) > 1) { ruleClass <- ruleClass[order( ruleClass$direction , decreasing = T), ] diff --git a/R/semi_pheatmap.R b/R/semi_pheatmap.R index f1426c12..adcc527e 100755 --- a/R/semi_pheatmap.R +++ b/R/semi_pheatmap.R @@ -755,7 +755,7 @@ vplayout <- function(x, y) { } # Omit border color if cell size is too small - if (mindim < 3){ + if (mindim < 3) { borderColor <- NA } @@ -898,8 +898,8 @@ vplayout <- function(x, y) { } # Draw annotation legend - annotation <- c(annotationCol[length(annotationCol):1], - annotationRow[length(annotationRow):1]) + annotation <- c(annotationCol[seq.int(from = length(annotationCol), to = 1)], + annotationRow[seq.int(from = length(annotationRow), to = 1)]) annotation <- annotation[unlist(lapply(annotation, function(x) !.is.na2(x)))] @@ -1494,7 +1494,7 @@ vplayout <- function(x, y) { #' clusteringDistanceCols = dcols) #' #' # Modify ordering of the clusters using clustering callback option -#' callback = function(hc, mat){ +#' callback = function(hc, mat) { #' sv = svd(t(mat))$v[, 1] #' dend = reorder(as.dendrogram(hc), wts = sv) #' as.hclust(dend) @@ -1552,7 +1552,7 @@ semiPheatmap <- function(mat, silent = FALSE, rowLabel, colLabel, - rowGroupOrder = NULL, + rowGroupOrder = NULL, colGroupOrder = NULL, ...) { @@ -1623,7 +1623,7 @@ semiPheatmap <- function(mat, rowLabel <- rep(1, nrow(mat)) } else { #o <- order(rowLabel) - o <- .Order(labels=rowLabel, groupOrder=rowGroupOrder) + o <- .Order(labels = rowLabel, groupOrder = rowGroupOrder) mat <- mat[o, , drop = FALSE] fmat <- fmat[o, , drop = FALSE] rowLabel <- rowLabel[o] @@ -1789,18 +1789,21 @@ semiPheatmap <- function(mat, -# order function that order the row/column labels based on the order of the group priority -# return value is a vector of the ordered index -# labels is a vector of any non-zero length -# groupOrder, a column named dataframe/matrix with the "groupName" column storing the group name and the "groupIndex" storing the group priority -.Order = function(labels, groupOrder=NULL){ - if (is.null(groupOrder)) { +# order function that order the row/column labels +# based on the order of the group priority +# return value is a vector of the ordered index +# labels is a vector of any non-zero length +# groupOrder, a column named dataframe/matrix +# with the "groupName" column storing the group +# name and the "groupIndex" storing the group priority +.Order <- function(labels, groupOrder=NULL) { + if (is.null(groupOrder)) { return(order(labels)) } else { - # Throw error is length(unique(labels)) != nrow(groupOrder) - - olabels = plyr::mapvalues(x=labels, from=groupOrder[,"groupName"], to=groupOrder[,"groupIndex"]) - olabels = as.integer(olabels) # Make sure the olabels is integer for order() function + # Throw error is length(unique(labels)) != nrow(groupOrder) + olabels <- plyr::mapvalues(x = labels, from = groupOrder[, "groupName"], + to = groupOrder[, "groupIndex"]) + olabels <- as.integer(olabels) # Make sure the olabels is integer for order() function return(order(olabels)) - } + } } From 7ad712896ab8d804342e00c213b22f6f0309e0c2 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Mon, 14 Oct 2019 13:14:47 -0400 Subject: [PATCH 056/149] code style reformat.... --- R/findMarkers.R | 2 +- R/getDecisions.R | 2 +- R/semi_pheatmap.R | 21 ++++++++++++--------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/R/findMarkers.R b/R/findMarkers.R index bf8ada3e..8d5a286c 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -204,7 +204,7 @@ findMarkers <- function(features, newLabels == cellType]))) # Adjust 'rules' table for new tree - newTree$rules <- lapply(newTree$rules, function(rules){ + newTree$rules <- lapply(newTree$rules, function(rules) { rules$level <- rules$level + max(tree$rules[[cellType]]$level) rules <- rbind(tree$rules[[cellType]], rules) }) diff --git a/R/getDecisions.R b/R/getDecisions.R index 69ed7014..c6d268a7 100644 --- a/R/getDecisions.R +++ b/R/getDecisions.R @@ -34,7 +34,7 @@ getDecisions <- function(rules, features) { } # Function to predict class from list of rules -.predictClass <- function(samp, rules){ +.predictClass <- function(samp, rules) { # Initilize possible classes and level classes <- names(rules) diff --git a/R/semi_pheatmap.R b/R/semi_pheatmap.R index adcc527e..cb2e42f4 100755 --- a/R/semi_pheatmap.R +++ b/R/semi_pheatmap.R @@ -898,7 +898,8 @@ vplayout <- function(x, y) { } # Draw annotation legend - annotation <- c(annotationCol[seq.int(from = length(annotationCol), to = 1)], + annotation <- c(annotationCol[seq.int( + from = length(annotationCol), to = 1)], annotationRow[seq.int(from = length(annotationRow), to = 1)]) annotation <- annotation[unlist(lapply(annotation, function(x) !.is.na2(x)))] @@ -1658,7 +1659,7 @@ semiPheatmap <- function(mat, colLabel <- rep(1, ncol(mat)) } else { #o <- order(colLabel) - o <- .Order(labels=colLabel, groupOrder=colGroupOrder) + o <- .Order(labels = colLabel, groupOrder = colGroupOrder) mat <- mat[, o, drop = FALSE] fmat <- fmat[, o, drop = FALSE] colLabel <- colLabel[o] @@ -1789,21 +1790,23 @@ semiPheatmap <- function(mat, -# order function that order the row/column labels +# order function that order the row/column labels # based on the order of the group priority # return value is a vector of the ordered index # labels is a vector of any non-zero length -# groupOrder, a column named dataframe/matrix -# with the "groupName" column storing the group +# groupOrder, a column named dataframe/matrix +# with the "groupName" column storing the group # name and the "groupIndex" storing the group priority .Order <- function(labels, groupOrder=NULL) { if (is.null(groupOrder)) { - return(order(labels)) + return(order(labels)) } else { # Throw error is length(unique(labels)) != nrow(groupOrder) - olabels <- plyr::mapvalues(x = labels, from = groupOrder[, "groupName"], - to = groupOrder[, "groupIndex"]) - olabels <- as.integer(olabels) # Make sure the olabels is integer for order() function + olabels <- plyr::mapvalues(x = labels, + from = groupOrder[, "groupName"], + to = groupOrder[, "groupIndex"]) + # Make sure the olabels is integer for order() function + olabels <- as.integer(olabels) return(order(olabels)) } } From 88dc78afda017d6f3c4908cee952dcd2d2a36ac8 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Mon, 14 Oct 2019 13:35:48 -0400 Subject: [PATCH 057/149] code style reformat.......... --- R/semi_pheatmap.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/semi_pheatmap.R b/R/semi_pheatmap.R index cb2e42f4..9f2e5e65 100755 --- a/R/semi_pheatmap.R +++ b/R/semi_pheatmap.R @@ -1806,7 +1806,7 @@ semiPheatmap <- function(mat, from = groupOrder[, "groupName"], to = groupOrder[, "groupIndex"]) # Make sure the olabels is integer for order() function - olabels <- as.integer(olabels) + olabels <- as.integer(olabels) return(order(olabels)) } } From 11772db7a3a708e7b16c15744f96990c1378fc01 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 15 Oct 2019 12:18:41 -0400 Subject: [PATCH 058/149] change plotDimReduceFeature default color scale --- R/plot_dr.R | 12 ++++++------ man/plotDimReduceFeature.Rd | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 3ff70c18..03c9b797 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -129,11 +129,11 @@ plotDimReduceGrid <- function(dim1, #' @param xlab Character vector. Label for the x-axis. Default "Dimension_1". #' @param ylab Character vector. Label for the y-axis. Default "Dimension_2". #' @param colorLow Character. A color available from `colors()`. The color -#' will be used to signify the lowest values on the scale. Default 'grey'. +#' will be used to signify the lowest values on the scale. Default 'blue'. #' @param colorMid Character. A color available from `colors()`. The color -#' will be used to signify the midpoint on the scale. +#' will be used to signify the midpoint on the scale. Default 'white'. #' @param colorHigh Character. A color available from `colors()`. The color -#' will be used to signify the highest values on the scale. Default 'blue'. +#' will be used to signify the highest values on the scale. Default 'red'. #' @return The plot as a ggplot object #' @examples #' \donttest{ @@ -158,9 +158,9 @@ plotDimReduceFeature <- function(dim1, size = 1, xlab = "Dimension_1", ylab = "Dimension_2", - colorLow = "grey", - colorMid = NULL, - colorHigh = "blue") { + colorLow = "blue", + colorMid = "white", + colorHigh = "red") { if (isFALSE(is.null(headers))) { if (length(headers) != length(features)) { diff --git a/man/plotDimReduceFeature.Rd b/man/plotDimReduceFeature.Rd index e0152836..93065ad6 100644 --- a/man/plotDimReduceFeature.Rd +++ b/man/plotDimReduceFeature.Rd @@ -6,8 +6,8 @@ \usage{ plotDimReduceFeature(dim1, dim2, counts, features, headers = NULL, normalize = TRUE, exactMatch = TRUE, trim = c(-2, 2), size = 1, - xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "grey", - colorMid = NULL, colorHigh = "blue") + xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "blue", + colorMid = "white", colorHigh = "red") } \arguments{ \item{dim1}{Numeric vector. First dimension from data @@ -42,13 +42,13 @@ Set to NULL to disable. Default c(-2,2).} \item{ylab}{Character vector. Label for the y-axis. Default "Dimension_2".} \item{colorLow}{Character. A color available from `colors()`. The color -will be used to signify the lowest values on the scale. Default 'grey'.} +will be used to signify the lowest values on the scale. Default 'blue'.} \item{colorMid}{Character. A color available from `colors()`. The color -will be used to signify the midpoint on the scale.} +will be used to signify the midpoint on the scale. Default 'white'.} \item{colorHigh}{Character. A color available from `colors()`. The color -will be used to signify the highest values on the scale. Default 'blue'.} +will be used to signify the highest values on the scale. Default 'red'.} } \value{ The plot as a ggplot object From 7e6be57767d7af3754fe951ff013b0474ef8e980 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 15 Oct 2019 12:19:53 -0400 Subject: [PATCH 059/149] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index cd002722..ddec3b80 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.1.6 +Version: 1.1.7 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), From 1eeb3bde3c741e7214f159b2ed8c2f207bc7f485 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Tue, 15 Oct 2019 12:54:35 -0400 Subject: [PATCH 060/149] fix lints --- R/celda_G.R | 2 +- R/decon.R | 17 ++++++----------- R/findMarkers.R | 10 +++++----- R/getDecisions.R | 4 ++-- R/semi_pheatmap.R | 8 ++++---- man/semiPheatmap.Rd | 2 +- 6 files changed, 19 insertions(+), 24 deletions(-) diff --git a/R/celda_G.R b/R/celda_G.R index ae6ae5bc..5f2b566a 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -1210,7 +1210,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), if (is.null(maxCells) || maxCells > ncol(counts)) { maxCells <- ncol(counts) - cellIx <- 1:ncol(counts) + cellIx <- seq_len(ncol(counts)) } else { cellIx <- sample(seq(ncol(counts)), maxCells) } diff --git a/R/decon.R b/R/decon.R index f296e493..db14ed97 100644 --- a/R/decon.R +++ b/R/decon.R @@ -710,8 +710,7 @@ addLogLikelihood <- function(llA, llB) { K <- ceiling(sqrt(nC)) } - globalZ <- - .initializeSplitZ( + globalZ <- .initializeSplitZ( counts, K = K, KSubcluster = NULL, @@ -722,7 +721,7 @@ addLogLikelihood <- function(llA, llB) { globalK <- max(globalZ) localZ <- rep(NA, nC) - for (k in 1:globalK) { + for (k in seq(globalK)) { if (sum(globalZ == k) > 2) { localCounts <- counts[, globalZ == k] localK <- min(K, ceiling(sqrt(ncol( @@ -741,16 +740,12 @@ addLogLikelihood <- function(llA, llB) { } } - - cbZ <- - interaction(globalZ, localZ, lex.order = TRUE, drop = TRUE) + cbZ <- interaction(globalZ, localZ, lex.order = TRUE, drop = TRUE) # combined z label - trZ <- - as.integer(sub("\\..*", "", levels(cbZ), perl = TRUE)) + trZ <- as.integer(sub("\\..*", "", levels(cbZ), perl = TRUE)) # transitional z label - cbZ <- - as.integer(plyr::mapvalues(cbZ, from = levels(cbZ), - to = 1:length(levels(cbZ)))) + cbZ <- as.integer(plyr::mapvalues(cbZ, from = levels(cbZ), + to = seq(length(levels(cbZ))))) return(list( diff --git a/R/findMarkers.R b/R/findMarkers.R index b4405988..8d5a286c 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -101,7 +101,7 @@ findMarkers <- function(features, stop("NA class values") } - if (any(is.na(features))){ + if (any(is.na(features))) { stop("NA feature values") } @@ -178,7 +178,7 @@ findMarkers <- function(features, dendro <- tree$dendro # Create separate trees for each cell type with more than one cluster - newTrees <- lapply(largeCellTypes, function(cellType){ + newTrees <- lapply(largeCellTypes, function(cellType) { # Print current status message("Building tree for cell type ", cellType) @@ -204,7 +204,7 @@ findMarkers <- function(features, newLabels == cellType]))) # Adjust 'rules' table for new tree - newTree$rules <- lapply(newTree$rules, function(rules){ + newTree$rules <- lapply(newTree$rules, function(rules) { rules$level <- rules$level + max(tree$rules[[cellType]]$level) rules <- rbind(tree$rules[[cellType]], rules) }) @@ -268,7 +268,7 @@ findMarkers <- function(features, cellTypeDendro <- newTrees[[cellType]]$dendro # Adjust labels, member count, and midpoint of nodes - dendro <- dendrapply(dendro, function(node){ + dendro <- dendrapply(dendro, function(node) { # Check if in right branch if (cellType %in% as.character(attributes(node)$classLabels)) { # Replace cell type label with subtype labels @@ -311,7 +311,7 @@ findMarkers <- function(features, cellTypeHeight <- attributes(cellTypeDendro)$height cellTypeDendro <- dendrapply(cellTypeDendro, function(node, parentHeight, cellTypeHeight) { - if (attributes(node)$height > 1){ + if (attributes(node)$height > 1) { attributes(node)$height <- parentHeight - 1 - (cellTypeHeight - attributes( node)$height) diff --git a/R/getDecisions.R b/R/getDecisions.R index 118be596..c6d268a7 100644 --- a/R/getDecisions.R +++ b/R/getDecisions.R @@ -34,7 +34,7 @@ getDecisions <- function(rules, features) { } # Function to predict class from list of rules -.predictClass <- function(samp, rules){ +.predictClass <- function(samp, rules) { # Initilize possible classes and level classes <- names(rules) @@ -60,7 +60,7 @@ getDecisions <- function(rules, features) { ruleClass$sample <- samp[ruleClass$feature] # For multiple direction == 1, use one with the top stat - if (sum(ruleClass$direction == 1) > 1){ + if (sum(ruleClass$direction == 1) > 1) { ruleClass <- ruleClass[order( ruleClass$direction , decreasing = T), ] diff --git a/R/semi_pheatmap.R b/R/semi_pheatmap.R index d977a828..f8169b53 100755 --- a/R/semi_pheatmap.R +++ b/R/semi_pheatmap.R @@ -755,7 +755,7 @@ vplayout <- function(x, y) { } # Omit border color if cell size is too small - if (mindim < 3){ + if (mindim < 3) { borderColor <- NA } @@ -898,8 +898,8 @@ vplayout <- function(x, y) { } # Draw annotation legend - annotation <- c(annotationCol[length(annotationCol):1], - annotationRow[length(annotationRow):1]) + annotation <- c(annotationCol[seq(length(annotationCol), 1)], + annotationRow[seq(length(annotationRow), 1)]) annotation <- annotation[unlist(lapply(annotation, function(x) !.is.na2(x)))] @@ -1494,7 +1494,7 @@ vplayout <- function(x, y) { #' clusteringDistanceCols = dcols) #' #' # Modify ordering of the clusters using clustering callback option -#' callback = function(hc, mat){ +#' callback = function(hc, mat) { #' sv = svd(t(mat))$v[, 1] #' dend = reorder(as.dendrogram(hc), wts = sv) #' as.hclust(dend) diff --git a/man/semiPheatmap.Rd b/man/semiPheatmap.Rd index fae1d21e..c2b23c42 100644 --- a/man/semiPheatmap.Rd +++ b/man/semiPheatmap.Rd @@ -276,7 +276,7 @@ pheatmap(test, clusteringDistanceCols = dcols) # Modify ordering of the clusters using clustering callback option -callback = function(hc, mat){ +callback = function(hc, mat) { sv = svd(t(mat))$v[, 1] dend = reorder(as.dendrogram(hc), wts = sv) as.hclust(dend) From 58f8538fcf9899219249d2cc8247e3a2738645d0 Mon Sep 17 00:00:00 2001 From: Nitesh Turaga Date: Tue, 29 Oct 2019 13:11:41 -0400 Subject: [PATCH 061/149] bump x.y.z version to even y prior to creation of RELEASE_3_10 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index ddec3b80..3e3b0c2d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.1.7 +Version: 1.2.0 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), From a4f3d2a28c72bdfa03073ffd6454946cc34a172a Mon Sep 17 00:00:00 2001 From: Nitesh Turaga Date: Tue, 29 Oct 2019 13:43:35 -0400 Subject: [PATCH 062/149] bump x.y.z version to odd y after creation of RELEASE_3_10 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3e3b0c2d..50353e95 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.2.0 +Version: 1.3.0 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), From 658ae9d4c0fc1f04489c54ce7f68ae5ffc63dab1 Mon Sep 17 00:00:00 2001 From: salzcamino Date: Tue, 12 Nov 2019 13:04:25 -0500 Subject: [PATCH 063/149] Added group.by param to plotDimReduceCluster --- R/plot_dr.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 35eaf1b7..5fabf98f 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -382,6 +382,8 @@ plotDimReduceModule <- #' If NULL, all clusters will be colored. Default NULL. #' @param labelClusters Logical. Whether the cluster labels are plotted. #' Default FALSE. +#' @param group.by Character vector. Contains sample labels for each cell. +#' If NULL, all samples will be plotted together. Default NULL. #' @param labelSize Numeric. Sets size of label if labelClusters is TRUE. #' Default 3.5. #' @return The plot as a ggplot object @@ -405,9 +407,15 @@ plotDimReduceCluster <- function(dim1, ylab = "Dimension_2", specificClusters = NULL, labelClusters = FALSE, + group.by = NULL, labelSize = 3.5) { - df <- data.frame(dim1, dim2, cluster) - colnames(df) <- c(xlab, ylab, "Cluster") + if (!is.null(group.by)) { + df <- data.frame(dim1, dim2, cluster, group.by) + colnames(df) <- c(xlab, ylab, "Cluster", "Sample") + } else { + df <- data.frame(dim1, dim2, cluster) + colnames(df) <- c(xlab, ylab, "Cluster") + } naIx <- is.na(dim1) | is.na(dim2) df <- df[!naIx, ] df[3] <- as.factor(df[[3]]) @@ -449,6 +457,9 @@ plotDimReduceCluster <- function(dim1, mapping = ggplot2::aes_string(label = "Cluster"), size = labelSize) } + if (!is.null(x = group.by)) { + g <- g + facet_wrap(facets = vars(!!sym(x = "Sample"))) + theme(strip.background = element_blank()) + } return(g) } From 17df2b55540c3f3d8fe5d5b8d26678cc6c1b33e7 Mon Sep 17 00:00:00 2001 From: salzcamino Date: Tue, 12 Nov 2019 13:57:36 -0500 Subject: [PATCH 064/149] styling changes --- R/plot_dr.R | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 5fabf98f..e3a41018 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -79,8 +79,9 @@ plotDimReduceGrid <- function(dim1, panel.spacing = unit(0, "lines"), panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black")) - if (isFALSE(is.null(ncol))){ - g <- g + ggplot2::facet_wrap(~ facet, labeller = headers, ncol = ncol) + if (isFALSE(is.null(ncol))) { + g <- g + ggplot2::facet_wrap(~ facet, labeller = headers, + ncol = ncol) } else { g <- g + ggplot2::facet_wrap(~ facet, labeller = headers) } @@ -103,7 +104,7 @@ plotDimReduceGrid <- function(dim1, panel.spacing = unit(0, "lines"), panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black")) - if (isFALSE(is.null(ncol))){ + if (isFALSE(is.null(ncol))) { g <- g + ggplot2::facet_wrap(~ facet, ncol = ncol) } else { g <- g + ggplot2::facet_wrap(~ facet) @@ -407,10 +408,10 @@ plotDimReduceCluster <- function(dim1, ylab = "Dimension_2", specificClusters = NULL, labelClusters = FALSE, - group.by = NULL, + groupBy = NULL, labelSize = 3.5) { - if (!is.null(group.by)) { - df <- data.frame(dim1, dim2, cluster, group.by) + if (!is.null(groupBy)) { + df <- data.frame(dim1, dim2, cluster, groupBy) colnames(df) <- c(xlab, ylab, "Cluster", "Sample") } else { df <- data.frame(dim1, dim2, cluster) @@ -457,8 +458,9 @@ plotDimReduceCluster <- function(dim1, mapping = ggplot2::aes_string(label = "Cluster"), size = labelSize) } - if (!is.null(x = group.by)) { - g <- g + facet_wrap(facets = vars(!!sym(x = "Sample"))) + theme(strip.background = element_blank()) + if (!is.null(x = groupBy)) { + g <- g + facet_wrap(facets = vars(!!sym(x = "Sample"))) + + theme(strip.background = element_blank()) } return(g) } From 9cf2ce0fdc44666853674dbc7e090a915ac94e5b Mon Sep 17 00:00:00 2001 From: salzcamino Date: Tue, 12 Nov 2019 14:03:21 -0500 Subject: [PATCH 065/149] styling changes --- R/plot_dr.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 47393a78..76c1c1f5 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -459,8 +459,8 @@ plotDimReduceCluster <- function(dim1, size = labelSize) } if (!is.null(x = groupBy)) { - g <- g + facet_wrap(facets = vars(!!sym(x = "Sample"))) - + theme(strip.background = element_blank()) + g <- g + facet_wrap(facets = vars(!!sym(x = "Sample"))) + + theme(strip.background = element_blank()) } return(g) } From 75be85fd64fe3eac98338b8fb2b022164aa83664 Mon Sep 17 00:00:00 2001 From: salzcamino Date: Tue, 12 Nov 2019 14:34:53 -0500 Subject: [PATCH 066/149] small description fix --- R/plot_dr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 76c1c1f5..e546d32b 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -383,7 +383,7 @@ plotDimReduceModule <- #' If NULL, all clusters will be colored. Default NULL. #' @param labelClusters Logical. Whether the cluster labels are plotted. #' Default FALSE. -#' @param group.by Character vector. Contains sample labels for each cell. +#' @param groupBy Character vector. Contains sample labels for each cell. #' If NULL, all samples will be plotted together. Default NULL. #' @param labelSize Numeric. Sets size of label if labelClusters is TRUE. #' Default 3.5. From 5e1c1d25468842a2b328f78a8db22d84d9d22749 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Thu, 14 Nov 2019 00:37:02 -0500 Subject: [PATCH 067/149] update z-assignment probability calculateion in celda_C --- R/celda_C.R | 52 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index e2a553f2..f4f5a9fc 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -433,35 +433,51 @@ celda_C <- function(counts, ix <- sample(seq(nM)) for (i in ix) { ## Subtract cell counts from current population assignment - nGByCP1 <- nGByCP - nGByCP1[, z[i]] <- nGByCP[, z[i]] - counts[, i] - nGByCP1 <- .colSums(lgamma(nGByCP1 + beta), nrow(nGByCP), ncol(nGByCP)) + #nGByCP1 <- nGByCP + #nGByCP1[, z[i]] <- nGByCP[, z[i]] - counts[, i] + #nGByCP1 <- .colSums(lgamma(nGByCP1 + beta), nrow(nGByCP), ncol(nGByCP)) - nCP1 <- nCP - nCP1[z[i]] <- nCP1[z[i]] - nByC[i] - nCP1 <- lgamma(nCP1 + (nG * beta)) + #nCP1 <- nCP + #nCP1[z[i]] <- nCP1[z[i]] - nByC[i] + #nCP1 <- lgamma(nCP1 + (nG * beta)) ## Add cell counts to all other populations - nGByCP2 <- nGByCP - otherIx <- seq(K)[-z[i]] - nGByCP2[, otherIx] <- nGByCP2[, otherIx] + counts[, i] - nGByCP2 <- .colSums(lgamma(nGByCP2 + beta), nrow(nGByCP), ncol(nGByCP)) + #nGByCP2 <- nGByCP + #otherIx <- seq(K)[-z[i]] + #nGByCP2[, otherIx] <- nGByCP2[, otherIx] + counts[, i] + #nGByCP2 <- .colSums(lgamma(nGByCP2 + beta), nrow(nGByCP), ncol(nGByCP)) - nCP2 <- nCP - nCP2[otherIx] <- nCP2[otherIx] + nByC[i] - nCP2 <- lgamma(nCP2 + (nG * beta)) + #nCP2 <- nCP + #nCP2[otherIx] <- nCP2[otherIx] + nByC[i] + #nCP2 <- lgamma(nCP2 + (nG * beta)) mCPByS[z[i], s[i]] <- mCPByS[z[i], s[i]] - 1L ## Calculate probabilities for each state + ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster for (j in seq_len(K)) { - otherIx <- seq(K)[-j] + #otherIx <- seq(K)[-j] + if (j != z[i]) { # when j is not current population assignment probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) - nGByCP2[j] - ## Phi Numerator (current cell) - sum(nCP1[otherIx]) - ## Phi Denominator (other cells) - nCP2[j] ## Phi Denominator (current cell) + sum(lgamma(nGByCP[j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator + lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator + sum(lgamma(nGByCP[j] + beta)) + # if without this cell -- Phi Numerator + lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator + #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) + #nGByCP2[j] - ## Phi Numerator (current cell) + #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) + #nCP2[j] - ## Phi Denominator (current cell) + } + else { # when j is current population assignment + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + sum(lgamma(nGByCP[j] + beta)) - + lgamma(nCP[j] + nG * beta) - + sum(lgamma(nGByCP[j] - counts[, i] + beta)) + + lgamma(nCP[j] - nByC[i] + nG * beta) + + } + } ## Sample next state and add back counts From db64c2d20f1f2adf631d44ff070a99018c975b64 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sat, 16 Nov 2019 23:46:14 -0500 Subject: [PATCH 068/149] format --- R/celda_C.R | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index f4f5a9fc..04bde65f 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -455,26 +455,26 @@ celda_C <- function(counts, mCPByS[z[i], s[i]] <- mCPByS[z[i], s[i]] - 1L ## Calculate probabilities for each state - ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster + ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster for (j in seq_len(K)) { #otherIx <- seq(K)[-j] - if (j != z[i]) { # when j is not current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator - lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator - sum(lgamma(nGByCP[j] + beta)) + # if without this cell -- Phi Numerator - lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator - #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) - #nGByCP2[j] - ## Phi Numerator (current cell) - #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) - #nCP2[j] - ## Phi Denominator (current cell) + if (j != z[i]) { # when j is not current population assignment + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + sum(lgamma(nGByCP[j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator + lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator + sum(lgamma(nGByCP[j] + beta)) + # if without this cell -- Phi Numerator + lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator + #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) + #nGByCP2[j] - ## Phi Numerator (current cell) + #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) + #nCP2[j] - ## Phi Denominator (current cell) } - else { # when j is current population assignment + else { # when j is current population assignment probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[j] + beta)) - - lgamma(nCP[j] + nG * beta) - - sum(lgamma(nGByCP[j] - counts[, i] + beta)) + - lgamma(nCP[j] - nByC[i] + nG * beta) + sum(lgamma(nGByCP[j] + beta)) - + lgamma(nCP[j] + nG * beta) - + sum(lgamma(nGByCP[j] - counts[, i] + beta)) + + lgamma(nCP[j] - nByC[i] + nG * beta) } From 04939adaae3fc41b3691d1e8d9a3d4661180b48c Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sun, 17 Nov 2019 00:34:00 -0500 Subject: [PATCH 069/149] index bug --- R/celda_C.R | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index 04bde65f..6d913041 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -458,25 +458,25 @@ celda_C <- function(counts, ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster for (j in seq_len(K)) { #otherIx <- seq(K)[-j] - if (j != z[i]) { # when j is not current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator - lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator - sum(lgamma(nGByCP[j] + beta)) + # if without this cell -- Phi Numerator - lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator - #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) - #nGByCP2[j] - ## Phi Numerator (current cell) - #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) - #nCP2[j] - ## Phi Denominator (current cell) - } - else { # when j is current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[j] + beta)) - - lgamma(nCP[j] + nG * beta) - - sum(lgamma(nGByCP[j] - counts[, i] + beta)) + - lgamma(nCP[j] - nByC[i] + nG * beta) - - } + if (j != z[i]) { # when j is not current population assignment + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator + lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator + sum(lgamma(nGByCP[, j] + beta)) + # if without this cell -- Phi Numerator + lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator + #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) + #nGByCP2[j] - ## Phi Numerator (current cell) + #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) + #nCP2[j] - ## Phi Denominator (current cell) + } + else { # when j is current population assignment + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + sum(lgamma(nGByCP[, j] + beta)) - + lgamma(nCP[j] + nG * beta) - + sum(lgamma(nGByCP[, j] - counts[, i] + beta)) + + lgamma(nCP[j] - nByC[i] + nG * beta) + + } } From 1a4e6615545a3a9c77c7f404bf4dc3bbe3322c96 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sun, 17 Nov 2019 00:36:17 -0500 Subject: [PATCH 070/149] format --- R/celda_C.R | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index 6d913041..350696a6 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -455,28 +455,28 @@ celda_C <- function(counts, mCPByS[z[i], s[i]] <- mCPByS[z[i], s[i]] - 1L ## Calculate probabilities for each state - ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster + ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster for (j in seq_len(K)) { - #otherIx <- seq(K)[-j] - if (j != z[i]) { # when j is not current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator - lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator - sum(lgamma(nGByCP[, j] + beta)) + # if without this cell -- Phi Numerator - lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator - #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) - #nGByCP2[j] - ## Phi Numerator (current cell) - #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) - #nCP2[j] - ## Phi Denominator (current cell) - } - else { # when j is current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[, j] + beta)) - - lgamma(nCP[j] + nG * beta) - - sum(lgamma(nGByCP[, j] - counts[, i] + beta)) + - lgamma(nCP[j] - nByC[i] + nG * beta) - - } + #otherIx <- seq(K)[-j] + if (j != z[i]) { # when j is not current population assignment + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator + lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator + sum(lgamma(nGByCP[, j] + beta)) + # if without this cell -- Phi Numerator + lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator + #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) + #nGByCP2[j] - ## Phi Numerator (current cell) + #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) + #nCP2[j] - ## Phi Denominator (current cell) + } + else { # when j is current population assignment + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + sum(lgamma(nGByCP[, j] + beta)) - + lgamma(nCP[j] + nG * beta) - + sum(lgamma(nGByCP[, j] - counts[, i] + beta)) + + lgamma(nCP[j] - nByC[i] + nG * beta) + + } } From 96fafbc16615368c1f077d2e66dc50d375fdaddb Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sun, 17 Nov 2019 01:09:43 -0500 Subject: [PATCH 071/149] format........ --- R/celda_C.R | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index 350696a6..37878ed0 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -455,28 +455,27 @@ celda_C <- function(counts, mCPByS[z[i], s[i]] <- mCPByS[z[i], s[i]] - 1L ## Calculate probabilities for each state - ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster + ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster for (j in seq_len(K)) { - #otherIx <- seq(K)[-j] - if (j != z[i]) { # when j is not current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator - lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator - sum(lgamma(nGByCP[, j] + beta)) + # if without this cell -- Phi Numerator - lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator - #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) - #nGByCP2[j] - ## Phi Numerator (current cell) - #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) - #nCP2[j] - ## Phi Denominator (current cell) - } - else { # when j is current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[, j] + beta)) - - lgamma(nCP[j] + nG * beta) - - sum(lgamma(nGByCP[, j] - counts[, i] + beta)) + - lgamma(nCP[j] - nByC[i] + nG * beta) - - } + #otherIx <- seq(K)[-j] + if (j != z[i]) { # when j is not current population assignment + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator + lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator + sum(lgamma(nGByCP[, j] + beta)) + # if without this cell -- Phi Numerator + lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator + #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) + #nGByCP2[j] - ## Phi Numerator (current cell) + #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) + #nCP2[j] - ## Phi Denominator (current cell) + } else { # when j is current population assignment + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + sum(lgamma(nGByCP[, j] + beta)) - + lgamma(nCP[j] + nG * beta) - + sum(lgamma(nGByCP[, j] - counts[, i] + beta)) + + lgamma(nCP[j] - nByC[i] + nG * beta) + + } } From b3f4ee1c9747684d55224f48c4a4efc481b02094 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 20 Nov 2019 10:31:16 -0500 Subject: [PATCH 072/149] format.. --- R/celda_C.R | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/R/celda_C.R b/R/celda_C.R index 37878ed0..b6f4bfbf 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -455,25 +455,26 @@ celda_C <- function(counts, mCPByS[z[i], s[i]] <- mCPByS[z[i], s[i]] - 1L ## Calculate probabilities for each state - ## when consider a specific cluster fo this cell, no need to calculate cells in other cluster + ## when consider a specific cluster fo this cell, + ## no need to calculate cells in other cluster for (j in seq_len(K)) { #otherIx <- seq(K)[-j] if (j != z[i]) { # when j is not current population assignment probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator - lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator - sum(lgamma(nGByCP[, j] + beta)) + # if without this cell -- Phi Numerator - lgamma(nCP[j] + nG * beta ) # if without this cell -- Phi Denominator + sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator + lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator + sum(lgamma(nGByCP[, j] + beta)) + # if without this cell -- Phi Numerator + lgamma(nCP[j] + nG * beta) # if without this cell -- Phi Denominator #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) #nGByCP2[j] - ## Phi Numerator (current cell) #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) #nCP2[j] - ## Phi Denominator (current cell) - } else { # when j is current population assignment + } else { # when j is current population assignment probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[, j] + beta)) - + sum(lgamma(nGByCP[, j] + beta)) - lgamma(nCP[j] + nG * beta) - sum(lgamma(nGByCP[, j] - counts[, i] + beta)) + - lgamma(nCP[j] - nByC[i] + nG * beta) + lgamma(nCP[j] - nByC[i] + nG * beta) } From ea52f3c512c6c96401bc52c25c0651e3dd47dd47 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 20 Nov 2019 16:31:23 -0500 Subject: [PATCH 073/149] update probability calculation of y for celda_G #213 --- R/RcppExports.R | 4 +++ R/celda_G.R | 2 +- man/plotDimReduceCluster.Rd | 6 +++- man/plotDimReduceFeature.Rd | 2 +- man/plotDimReduceGrid.Rd | 2 +- man/plotDimReduceModule.Rd | 2 +- man/plotHeatmap.Rd | 8 ++--- man/semiPheatmap.Rd | 3 +- src/RcppExports.cpp | 24 +++++++++++++++ src/cG_calcGibbsProbY.cpp | 61 +++++++++++++++++++++++++++++++++++++ 10 files changed, 104 insertions(+), 10 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 0a461013..aef40203 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,6 +9,10 @@ cG_CalcGibbsProbY <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, .Call('_celda_cG_CalcGibbsProbY', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } +cG_CalcGibbsProbY_fast <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY_fast', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +} + #' Fast matrix multiplication for double x int #' #' @param A a double matrix diff --git a/R/celda_G.R b/R/celda_G.R index 5f2b566a..b6453b58 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -417,7 +417,7 @@ celda_G <- function(counts, probs <- matrix(NA, ncol = nG, nrow = L) ix <- sample(seq(nG)) for (i in ix) { - probs[, i] <- cG_CalcGibbsProbY( + probs[, i] <- cG_CalcGibbsProbY_fast( index = i, counts = counts, nTSbyC = nTSByC, diff --git a/man/plotDimReduceCluster.Rd b/man/plotDimReduceCluster.Rd index d9756f53..a6e9dd11 100644 --- a/man/plotDimReduceCluster.Rd +++ b/man/plotDimReduceCluster.Rd @@ -6,7 +6,8 @@ \usage{ plotDimReduceCluster(dim1, dim2, cluster, size = 1, xlab = "Dimension_1", ylab = "Dimension_2", - specificClusters = NULL, labelClusters = FALSE, labelSize = 3.5) + specificClusters = NULL, labelClusters = FALSE, groupBy = NULL, + labelSize = 3.5) } \arguments{ \item{dim1}{Numeric vector. First dimension from data @@ -31,6 +32,9 @@ If NULL, all clusters will be colored. Default NULL.} \item{labelClusters}{Logical. Whether the cluster labels are plotted. Default FALSE.} +\item{groupBy}{Character vector. Contains sample labels for each cell. +If NULL, all samples will be plotted together. Default NULL.} + \item{labelSize}{Numeric. Sets size of label if labelClusters is TRUE. Default 3.5.} } diff --git a/man/plotDimReduceFeature.Rd b/man/plotDimReduceFeature.Rd index 93065ad6..6f46fc23 100644 --- a/man/plotDimReduceFeature.Rd +++ b/man/plotDimReduceFeature.Rd @@ -7,7 +7,7 @@ plotDimReduceFeature(dim1, dim2, counts, features, headers = NULL, normalize = TRUE, exactMatch = TRUE, trim = c(-2, 2), size = 1, xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "blue", - colorMid = "white", colorHigh = "red") + colorMid = "white", colorHigh = "red", ncol = NULL) } \arguments{ \item{dim1}{Numeric vector. First dimension from data diff --git a/man/plotDimReduceGrid.Rd b/man/plotDimReduceGrid.Rd index 1c0b1f3d..fd0fc390 100644 --- a/man/plotDimReduceGrid.Rd +++ b/man/plotDimReduceGrid.Rd @@ -5,7 +5,7 @@ \title{Mapping the dimensionality reduction plot} \usage{ plotDimReduceGrid(dim1, dim2, matrix, size, xlab, ylab, colorLow, colorMid, - colorHigh, varLabel, headers = NULL) + colorHigh, varLabel, ncol = NULL, headers = NULL) } \arguments{ \item{dim1}{Numeric vector. First dimension from data dimensionality diff --git a/man/plotDimReduceModule.Rd b/man/plotDimReduceModule.Rd index f1082752..0373bb13 100644 --- a/man/plotDimReduceModule.Rd +++ b/man/plotDimReduceModule.Rd @@ -8,7 +8,7 @@ plotDimReduceModule(dim1, dim2, counts, celdaMod, modules = NULL, rescale = TRUE, size = 1, xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, - colorHigh = "blue") + colorHigh = "blue", ncol = NULL) } \arguments{ \item{dim1}{Numeric vector. diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index 9cdefd74..387e6191 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -4,10 +4,10 @@ \alias{plotHeatmap} \title{Plots heatmap based on Celda model} \usage{ -plotHeatmap(counts, z = NULL, y = NULL, scaleRow = scale, - trim = c(-2, 2), featureIx = NULL, cellIx = NULL, - clusterFeature = TRUE, clusterCell = TRUE, - colorScheme = c("divergent", "sequential"), +plotHeatmap(counts, z = NULL, y = NULL, rowGroupOrder = NULL, + colGroupOrder = NULL, scaleRow = scale, trim = c(-2, 2), + featureIx = NULL, cellIx = NULL, clusterFeature = TRUE, + clusterCell = TRUE, colorScheme = c("divergent", "sequential"), colorSchemeSymmetric = TRUE, colorSchemeCenter = 0, col = NULL, annotationCell = NULL, annotationFeature = NULL, annotationColor = NULL, breaks = NULL, legend = TRUE, diff --git a/man/semiPheatmap.Rd b/man/semiPheatmap.Rd index c2b23c42..dcfb9d3f 100644 --- a/man/semiPheatmap.Rd +++ b/man/semiPheatmap.Rd @@ -23,7 +23,8 @@ semiPheatmap(mat, color = colorRampPalette(rev(brewer.pal(n = 7, name = numberColor = "grey30", fontSizeNumber = 0.8 * fontSize, gapsRow = NULL, gapsCol = NULL, labelsRow = NULL, labelsCol = NULL, fileName = NA, width = NA, height = NA, - silent = FALSE, rowLabel, colLabel, ...) + silent = FALSE, rowLabel, colLabel, rowGroupOrder = NULL, + colGroupOrder = NULL, ...) } \arguments{ \item{mat}{numeric matrix of the values to be plotted.} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 37a8a6b7..fb6c829e 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -50,6 +50,29 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cG_CalcGibbsProbY_fast +NumericVector cG_CalcGibbsProbY_fast(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY_fast(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type nTSbyC(nTSbyCSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyTS(nbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nGbyTS(nGbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyG(nbyGSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type y(ySEXP); + Rcpp::traits::input_parameter< const int >::type L(LSEXP); + Rcpp::traits::input_parameter< const int >::type nG(nGSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_beta(lg_betaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); + Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fast(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + return rcpp_result_gen; +END_RCPP +} // eigenMatMultInt SEXP eigenMatMultInt(const Eigen::Map A, const Eigen::Map< Eigen::MatrixXi> B); RcppExport SEXP _celda_eigenMatMultInt(SEXP ASEXP, SEXP BSEXP) { @@ -110,6 +133,7 @@ RcppExport SEXP _rowSumByGroupChange(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11}, {"_celda_cG_CalcGibbsProbY", (DL_FUNC) &_celda_cG_CalcGibbsProbY, 13}, + {"_celda_cG_CalcGibbsProbY_fast", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fast, 13}, {"_celda_eigenMatMultInt", (DL_FUNC) &_celda_eigenMatMultInt, 2}, {"_celda_fastNormProp", (DL_FUNC) &_celda_fastNormProp, 2}, {"_celda_fastNormPropLog", (DL_FUNC) &_celda_fastNormPropLog, 2}, diff --git a/src/cG_calcGibbsProbY.cpp b/src/cG_calcGibbsProbY.cpp index 904bd502..939ab44a 100644 --- a/src/cG_calcGibbsProbY.cpp +++ b/src/cG_calcGibbsProbY.cpp @@ -120,3 +120,64 @@ NumericVector cG_CalcGibbsProbY(const int index, return(probs); } + +// [[Rcpp::export]] +NumericVector cG_CalcGibbsProbY_fast(const int index, + const IntegerMatrix& counts, + const IntegerMatrix& nTSbyC, + const IntegerVector& nbyTS, + const IntegerVector& nGbyTS, + const IntegerVector& nbyG, + const IntegerVector& y, + const int L, + const int nG, + const NumericVector& lg_beta, + const NumericVector& lg_gamma, + const NumericVector& lg_delta, + const double delta) { + + int index0 = index - 1; + int current_y = y[index0] - 1; + int i; + + NumericVector probs(L); + + // Calculate probabilities related to the "n.TS.by.C" part of equation one time up front + // The first case of if statement represents when the current feature is already added to that module + // The second case represents when the current feature is NOT YET added to that module + for (i = 0; i < L; i++) { + if (i == current_y ) { + for (int col = 0; col < counts.ncol(); col++) { + probs[i] += lg_beta[nTSbyC(i, col)]; + probs[i] -= lg_beta[nTSbyC(i, col) - counts(index0, col)]; + } + } else { + for (int col = 0; col < counts.ncol(); col++) { + probs[i] += lg_beta[nTSbyC(i, col) + counts(index0, col)]; + probs[i] -= lg_beta[nTSbyC(i, col)]; + } + } + } + + // Calculate the probabilities for each module + // If statements determine whether to add or subtract counts from each probability + for (i = 0; i < L; i++) { + if (i == current_y) { + probs[i] += lg_gamma[nGbyTS[i]]; + probs[i] -= lg_gamma[nGbyTS[i] - 1]; + probs[i] += lg_delta[nGbyTS[i]]; + probs[i] -= lg_delta[nGbyTS[i] - 1]; + probs[i] += lgamma(nbyTS[i] - nbyG[index0] + (nGbyTS[i] - 1) * delta); + probs[i] -= lgamma(nbyTS[i] + nGbyTS[i] * delta); + } else { + probs[i] += lg_gamma[nGbyTS[i] + 1]; + probs[i] -= lg_gamma[nGbyTS[i]]; + probs[i] += lg_delta[nGbyTS[i] + 1]; + probs[i] -= lg_delta[nGbyTS[i]]; + probs[i] += lgamma(nbyTS[i] + nGbyTS[i] * delta); + probs[i] -= lgamma(nbyTS[i] + nbyG[index0] + (nGbyTS[i] + 1) * delta); + } + } + + return(probs); +} From d5a7cefeca3087c5b2235d58a14dbd7bf99b4b98 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Tue, 26 Nov 2019 21:20:00 -0500 Subject: [PATCH 074/149] column first loop in Rcpp y probability calculation to speed up --- R/RcppExports.R | 8 +++ src/RcppExports.cpp | 48 ++++++++++++++ src/cG_calcGibbsProbY.cpp | 131 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+) diff --git a/R/RcppExports.R b/R/RcppExports.R index aef40203..ccb7599f 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -13,6 +13,14 @@ cG_CalcGibbsProbY_fast <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y .Call('_celda_cG_CalcGibbsProbY_fast', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } +cG_CalcGibbsProbY_fastVector <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY_fastVector', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +} + +cG_CalcGibbsProbY_fastVectorFlip <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY_fastVectorFlip', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +} + #' Fast matrix multiplication for double x int #' #' @param A a double matrix diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index fb6c829e..e1b69e43 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -73,6 +73,52 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cG_CalcGibbsProbY_fastVector +NumericVector cG_CalcGibbsProbY_fastVector(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY_fastVector(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type nTSbyC(nTSbyCSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyTS(nbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nGbyTS(nGbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyG(nbyGSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type y(ySEXP); + Rcpp::traits::input_parameter< const int >::type L(LSEXP); + Rcpp::traits::input_parameter< const int >::type nG(nGSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_beta(lg_betaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); + Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fastVector(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + return rcpp_result_gen; +END_RCPP +} +// cG_CalcGibbsProbY_fastVectorFlip +NumericVector cG_CalcGibbsProbY_fastVectorFlip(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY_fastVectorFlip(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type nTSbyC(nTSbyCSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyTS(nbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nGbyTS(nGbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyG(nbyGSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type y(ySEXP); + Rcpp::traits::input_parameter< const int >::type L(LSEXP); + Rcpp::traits::input_parameter< const int >::type nG(nGSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_beta(lg_betaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); + Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fastVectorFlip(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + return rcpp_result_gen; +END_RCPP +} // eigenMatMultInt SEXP eigenMatMultInt(const Eigen::Map A, const Eigen::Map< Eigen::MatrixXi> B); RcppExport SEXP _celda_eigenMatMultInt(SEXP ASEXP, SEXP BSEXP) { @@ -134,6 +180,8 @@ static const R_CallMethodDef CallEntries[] = { {"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11}, {"_celda_cG_CalcGibbsProbY", (DL_FUNC) &_celda_cG_CalcGibbsProbY, 13}, {"_celda_cG_CalcGibbsProbY_fast", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fast, 13}, + {"_celda_cG_CalcGibbsProbY_fastVector", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastVector, 13}, + {"_celda_cG_CalcGibbsProbY_fastVectorFlip", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastVectorFlip, 13}, {"_celda_eigenMatMultInt", (DL_FUNC) &_celda_eigenMatMultInt, 2}, {"_celda_fastNormProp", (DL_FUNC) &_celda_fastNormProp, 2}, {"_celda_fastNormPropLog", (DL_FUNC) &_celda_fastNormPropLog, 2}, diff --git a/src/cG_calcGibbsProbY.cpp b/src/cG_calcGibbsProbY.cpp index 939ab44a..402e91c5 100644 --- a/src/cG_calcGibbsProbY.cpp +++ b/src/cG_calcGibbsProbY.cpp @@ -181,3 +181,134 @@ NumericVector cG_CalcGibbsProbY_fast(const int index, return(probs); } + + +// [[Rcpp::export]] +NumericVector cG_CalcGibbsProbY_fastVector(const int index, + const IntegerMatrix& counts, + const IntegerMatrix& nTSbyC, + const IntegerVector& nbyTS, + const IntegerVector& nGbyTS, + const IntegerVector& nbyG, + const IntegerVector& y, + const int L, + const int nG, + const NumericVector& lg_beta, + const NumericVector& lg_gamma, + const NumericVector& lg_delta, + const double delta) { + + int index0 = index - 1; + int current_y = y[index0] - 1; + int i; + int j,k; + + NumericVector probs(L); + + // Calculate probabilities related to the "n.TS.by.C" part of equation one time up front + // The first case of if statement represents when the current feature is already added to that module + // The second case represents when the current feature is NOT YET added to that module + for (i = 0; i < L; i++) { + if (i == current_y ) { + for (int col = 0; col < counts.ncol(); col++) { + j = col * L + i; // Index for the current module in the n.TS.by.C matrix + k = col * nG + index0; // Index for the current feature in counts matrix + probs[i] += lg_beta[nTSbyC[j]]; + probs[i] -= lg_beta[nTSbyC[j] - counts[k]]; + } + } else { + for (int col = 0; col < counts.ncol(); col++) { + j = col * L + i; + k = col * nG + index0; + probs[i] += lg_beta[nTSbyC[j] + counts[k]]; + probs[i] -= lg_beta[nTSbyC[j]]; + } + } + } + + // Calculate the probabilities for each module + // If statements determine whether to add or subtract counts from each probability + for (i = 0; i < L; i++) { + if (i == current_y) { + probs[i] += lg_gamma[nGbyTS[i]]; + probs[i] -= lg_gamma[nGbyTS[i] - 1]; + probs[i] += lg_delta[nGbyTS[i]]; + probs[i] -= lg_delta[nGbyTS[i] - 1]; + probs[i] += lgamma(nbyTS[i] - nbyG[index0] + (nGbyTS[i] - 1) * delta); + probs[i] -= lgamma(nbyTS[i] + nGbyTS[i] * delta); + } else { + probs[i] += lg_gamma[nGbyTS[i] + 1]; + probs[i] -= lg_gamma[nGbyTS[i]]; + probs[i] += lg_delta[nGbyTS[i] + 1]; + probs[i] -= lg_delta[nGbyTS[i]]; + probs[i] += lgamma(nbyTS[i] + nGbyTS[i] * delta); + probs[i] -= lgamma(nbyTS[i] + nbyG[index0] + (nGbyTS[i] + 1) * delta); + } + } + + return(probs); +} + + +// [[Rcpp::export]] +NumericVector cG_CalcGibbsProbY_fastVectorFlip(const int index, + const IntegerMatrix& counts, + const IntegerMatrix& nTSbyC, + const IntegerVector& nbyTS, + const IntegerVector& nGbyTS, + const IntegerVector& nbyG, + const IntegerVector& y, + const int L, + const int nG, + const NumericVector& lg_beta, + const NumericVector& lg_gamma, + const NumericVector& lg_delta, + const double delta) { + + int index0 = index - 1; + int current_y = y[index0] - 1; + int i; + int j,k; + + NumericVector probs(L); + + // Calculate probabilities related to the "n.TS.by.C" part of equation one time up front + // The first case of if statement represents when the current feature is already added to that module + // The second case represents when the current feature is NOT YET added to that module + for (int col = 0; col < counts.ncol(); col++) { + k = col * nG + index0; // Index for the current feature in counts matrix + for (i = 0; i < L; i++) { + j = col * L + i; // Index for the current module in the n.TS.by.C matrix + if (i == current_y) { + probs[i] += lg_beta[nTSbyC[j]]; + probs[i] -= lg_beta[nTSbyC[j] - counts[k]]; + } else { + probs[i] += lg_beta[nTSbyC[j] + counts[k]]; + probs[i] -= lg_beta[nTSbyC[j]]; + } + } + } + + + // Calculate the probabilities for each module + // If statements determine whether to add or subtract counts from each probability + for (i = 0; i < L; i++) { + if (i == current_y) { + probs[i] += lg_gamma[nGbyTS[i]]; + probs[i] -= lg_gamma[nGbyTS[i] - 1]; + probs[i] += lg_delta[nGbyTS[i]]; + probs[i] -= lg_delta[nGbyTS[i] - 1]; + probs[i] += lgamma(nbyTS[i] - nbyG[index0] + (nGbyTS[i] - 1) * delta); + probs[i] -= lgamma(nbyTS[i] + nGbyTS[i] * delta); + } else { + probs[i] += lg_gamma[nGbyTS[i] + 1]; + probs[i] -= lg_gamma[nGbyTS[i]]; + probs[i] += lg_delta[nGbyTS[i] + 1]; + probs[i] -= lg_delta[nGbyTS[i]]; + probs[i] += lgamma(nbyTS[i] + nGbyTS[i] * delta); + probs[i] -= lgamma(nbyTS[i] + nbyG[index0] + (nGbyTS[i] + 1) * delta); + } + } + + return(probs); +} From fe25283dd35237fd500ad420386cfa8204cfb1bc Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 4 Dec 2019 16:06:12 -0500 Subject: [PATCH 075/149] empty rows decontX --- R/decon.R | 8 ++++++-- man/plotDimReduceCluster.Rd | 6 +++++- man/plotDimReduceFeature.Rd | 2 +- man/plotDimReduceGrid.Rd | 2 +- man/plotDimReduceModule.Rd | 2 +- man/plotHeatmap.Rd | 8 ++++---- man/semiPheatmap.Rd | 3 ++- 7 files changed, 20 insertions(+), 11 deletions(-) diff --git a/R/decon.R b/R/decon.R index 134d6d12..01cdddb6 100644 --- a/R/decon.R +++ b/R/decon.R @@ -288,8 +288,12 @@ decontX <- function(counts, verbose = TRUE, seed = 12345) { + res = matrix(0, ncol = ncol(counts), nrow = nrow(counts), + dimnames = list(rownames(counts), colnames(counts))) + noneEmpGeneIndex = rowSums(counts) != 0 + counts = counts[noneEmpGeneIndex, ] if (is.null(seed)) { - res <- .decontX(counts = counts, + res[noneEmpGeneIndex, ] <- .decontX(counts = counts, z = z, batch = batch, maxIter = maxIter, @@ -298,7 +302,7 @@ decontX <- function(counts, verbose = verbose) } else { with_seed(seed, - res <- .decontX(counts = counts, + res[noneEmpGeneIndex, ] <- .decontX(counts = counts, z = z, batch = batch, maxIter = maxIter, diff --git a/man/plotDimReduceCluster.Rd b/man/plotDimReduceCluster.Rd index d9756f53..a6e9dd11 100644 --- a/man/plotDimReduceCluster.Rd +++ b/man/plotDimReduceCluster.Rd @@ -6,7 +6,8 @@ \usage{ plotDimReduceCluster(dim1, dim2, cluster, size = 1, xlab = "Dimension_1", ylab = "Dimension_2", - specificClusters = NULL, labelClusters = FALSE, labelSize = 3.5) + specificClusters = NULL, labelClusters = FALSE, groupBy = NULL, + labelSize = 3.5) } \arguments{ \item{dim1}{Numeric vector. First dimension from data @@ -31,6 +32,9 @@ If NULL, all clusters will be colored. Default NULL.} \item{labelClusters}{Logical. Whether the cluster labels are plotted. Default FALSE.} +\item{groupBy}{Character vector. Contains sample labels for each cell. +If NULL, all samples will be plotted together. Default NULL.} + \item{labelSize}{Numeric. Sets size of label if labelClusters is TRUE. Default 3.5.} } diff --git a/man/plotDimReduceFeature.Rd b/man/plotDimReduceFeature.Rd index 93065ad6..6f46fc23 100644 --- a/man/plotDimReduceFeature.Rd +++ b/man/plotDimReduceFeature.Rd @@ -7,7 +7,7 @@ plotDimReduceFeature(dim1, dim2, counts, features, headers = NULL, normalize = TRUE, exactMatch = TRUE, trim = c(-2, 2), size = 1, xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "blue", - colorMid = "white", colorHigh = "red") + colorMid = "white", colorHigh = "red", ncol = NULL) } \arguments{ \item{dim1}{Numeric vector. First dimension from data diff --git a/man/plotDimReduceGrid.Rd b/man/plotDimReduceGrid.Rd index 1c0b1f3d..fd0fc390 100644 --- a/man/plotDimReduceGrid.Rd +++ b/man/plotDimReduceGrid.Rd @@ -5,7 +5,7 @@ \title{Mapping the dimensionality reduction plot} \usage{ plotDimReduceGrid(dim1, dim2, matrix, size, xlab, ylab, colorLow, colorMid, - colorHigh, varLabel, headers = NULL) + colorHigh, varLabel, ncol = NULL, headers = NULL) } \arguments{ \item{dim1}{Numeric vector. First dimension from data dimensionality diff --git a/man/plotDimReduceModule.Rd b/man/plotDimReduceModule.Rd index f1082752..0373bb13 100644 --- a/man/plotDimReduceModule.Rd +++ b/man/plotDimReduceModule.Rd @@ -8,7 +8,7 @@ plotDimReduceModule(dim1, dim2, counts, celdaMod, modules = NULL, rescale = TRUE, size = 1, xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, - colorHigh = "blue") + colorHigh = "blue", ncol = NULL) } \arguments{ \item{dim1}{Numeric vector. diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index 9cdefd74..387e6191 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -4,10 +4,10 @@ \alias{plotHeatmap} \title{Plots heatmap based on Celda model} \usage{ -plotHeatmap(counts, z = NULL, y = NULL, scaleRow = scale, - trim = c(-2, 2), featureIx = NULL, cellIx = NULL, - clusterFeature = TRUE, clusterCell = TRUE, - colorScheme = c("divergent", "sequential"), +plotHeatmap(counts, z = NULL, y = NULL, rowGroupOrder = NULL, + colGroupOrder = NULL, scaleRow = scale, trim = c(-2, 2), + featureIx = NULL, cellIx = NULL, clusterFeature = TRUE, + clusterCell = TRUE, colorScheme = c("divergent", "sequential"), colorSchemeSymmetric = TRUE, colorSchemeCenter = 0, col = NULL, annotationCell = NULL, annotationFeature = NULL, annotationColor = NULL, breaks = NULL, legend = TRUE, diff --git a/man/semiPheatmap.Rd b/man/semiPheatmap.Rd index c2b23c42..dcfb9d3f 100644 --- a/man/semiPheatmap.Rd +++ b/man/semiPheatmap.Rd @@ -23,7 +23,8 @@ semiPheatmap(mat, color = colorRampPalette(rev(brewer.pal(n = 7, name = numberColor = "grey30", fontSizeNumber = 0.8 * fontSize, gapsRow = NULL, gapsCol = NULL, labelsRow = NULL, labelsCol = NULL, fileName = NA, width = NA, height = NA, - silent = FALSE, rowLabel, colLabel, ...) + silent = FALSE, rowLabel, colLabel, rowGroupOrder = NULL, + colGroupOrder = NULL, ...) } \arguments{ \item{mat}{numeric matrix of the values to be plotted.} From 316be070a14eae6211a6408ffaf886f6f00ba0a8 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 4 Dec 2019 17:26:51 -0500 Subject: [PATCH 076/149] empty genes in decontx wont be used during estimation --- R/decon.R | 61 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/R/decon.R b/R/decon.R index 01cdddb6..a14c5f5d 100644 --- a/R/decon.R +++ b/R/decon.R @@ -288,12 +288,8 @@ decontX <- function(counts, verbose = TRUE, seed = 12345) { - res = matrix(0, ncol = ncol(counts), nrow = nrow(counts), - dimnames = list(rownames(counts), colnames(counts))) - noneEmpGeneIndex = rowSums(counts) != 0 - counts = counts[noneEmpGeneIndex, ] if (is.null(seed)) { - res[noneEmpGeneIndex, ] <- .decontX(counts = counts, + res <- .decontX(counts = counts, z = z, batch = batch, maxIter = maxIter, @@ -302,7 +298,7 @@ decontX <- function(counts, verbose = verbose) } else { with_seed(seed, - res[noneEmpGeneIndex, ] <- .decontX(counts = counts, + res <- .decontX(counts = counts, z = z, batch = batch, maxIter = maxIter, @@ -322,14 +318,24 @@ decontX <- function(counts, delta = 10, logfile = NULL, verbose = TRUE) { + # empty expression genes won't be used for estimation + haveEmptyGenes = FALSE + totalGenes = nrow(counts) + noneEmptyGeneIndex = rowSums(counts) != 0 + if (sum(noneEmptyGeneIndex) != totalGenes) { + geneNames = rownames(counts) + counts = counts[noneEmptyGeneIndex, ] + haveEmptyGenes = TRUE + } + if (!is.null(batch)) { ## Set result lists upfront for all cells from different batches logLikelihood <- c() estRmat <- matrix( - NA, + 0, ncol = ncol(counts), - nrow = nrow(counts), - dimnames = list(rownames(counts), colnames(counts)) + nrow = totalGenes, + dimnames = list(geneNames, colnames(counts)) ) theta <- rep(NA, ncol(counts)) estConp <- rep(NA, ncol(counts)) @@ -353,8 +359,13 @@ decontX <- function(counts, verbose = verbose ) - estRmat[, batch == bat] <- - resBat$resList$estNativeCounts + if (haveEmptyGenes) { + estRmat[noneEmptyGeneIndex, batch == bat] <- + resBat$resList$estNativeCounts + } else { + estRmat[, batch == bat] <- + resBat$resList$estNativeCounts + } estConp[batch == bat] <- resBat$resList$estConp theta[batch == bat] <- resBat$resList$theta @@ -382,16 +393,23 @@ decontX <- function(counts, )) } - return( - .decontXoneBatch( - counts = counts, - z = z, - maxIter = maxIter, - delta = delta, - logfile = logfile, - verbose = verbose - ) + # when there is only one batch + resultsOneBatch = .decontXoneBatch( + counts = counts, + z = z, + maxIter = maxIter, + delta = delta, + logfile = logfile, + verbose = verbose ) + if (haveEmptyGenes) { + resBat = matrix(0, nrow = totalGenes, ncol = ncol(counts), + dimnames = list(geneNames, colnames(counts))) + resBat[noneEmptyGeneIndex, ] = resultsOneBatch$resList$estNativeCounts + resultsOneBatch$resList$estNativeCounts = resBat + } + return(resultsOneBatch) + } @@ -662,6 +680,9 @@ decontX <- function(counts, if (sum(is.na(counts)) > 0) { stop("Missing value in 'counts' matrix.") } + if (nrow(counts) < 2) { + stop("At least 2 genes need to have non-zero expressions.") + } } From c95cfb9c3e307f9de9fb0416907b6eb52ab3c0da Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 4 Dec 2019 17:32:26 -0500 Subject: [PATCH 077/149] at least 2 genes need to be non-0 expression matrix check for decontX --- R/decon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index a14c5f5d..58431383 100644 --- a/R/decon.R +++ b/R/decon.R @@ -680,7 +680,7 @@ decontX <- function(counts, if (sum(is.na(counts)) > 0) { stop("Missing value in 'counts' matrix.") } - if (nrow(counts) < 2) { + if (is.null(dim(counts))) { stop("At least 2 genes need to have non-zero expressions.") } } From cd90b411f4692b9efdfe006777b363a74ce7d194 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 4 Dec 2019 18:03:56 -0500 Subject: [PATCH 078/149] column first loop in cpp for y probability calculation fasterrrrr --- R/RcppExports.R | 12 +++---- src/RcppExports.cpp | 42 +++++------------------ src/cG_calcGibbsProbY.cpp | 71 ++------------------------------------- 3 files changed, 15 insertions(+), 110 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index ccb7599f..7f1b19a2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,16 +9,12 @@ cG_CalcGibbsProbY <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, .Call('_celda_cG_CalcGibbsProbY', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } -cG_CalcGibbsProbY_fast <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { - .Call('_celda_cG_CalcGibbsProbY_fast', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) -} - -cG_CalcGibbsProbY_fastVector <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { - .Call('_celda_cG_CalcGibbsProbY_fastVector', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +cG_CalcGibbsProbY_fastRow <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY_fastRow', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } -cG_CalcGibbsProbY_fastVectorFlip <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { - .Call('_celda_cG_CalcGibbsProbY_fastVectorFlip', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +cG_CalcGibbsProbY_fast <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY_fast', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } #' Fast matrix multiplication for double x int diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index e1b69e43..722b8d75 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -50,9 +50,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// cG_CalcGibbsProbY_fast -NumericVector cG_CalcGibbsProbY_fast(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); -RcppExport SEXP _celda_cG_CalcGibbsProbY_fast(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +// cG_CalcGibbsProbY_fastRow +NumericVector cG_CalcGibbsProbY_fastRow(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY_fastRow(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -69,36 +69,13 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); - rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fast(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fastRow(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); return rcpp_result_gen; END_RCPP } -// cG_CalcGibbsProbY_fastVector -NumericVector cG_CalcGibbsProbY_fastVector(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); -RcppExport SEXP _celda_cG_CalcGibbsProbY_fastVector(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const int >::type index(indexSEXP); - Rcpp::traits::input_parameter< const IntegerMatrix& >::type counts(countsSEXP); - Rcpp::traits::input_parameter< const IntegerMatrix& >::type nTSbyC(nTSbyCSEXP); - Rcpp::traits::input_parameter< const IntegerVector& >::type nbyTS(nbyTSSEXP); - Rcpp::traits::input_parameter< const IntegerVector& >::type nGbyTS(nGbyTSSEXP); - Rcpp::traits::input_parameter< const IntegerVector& >::type nbyG(nbyGSEXP); - Rcpp::traits::input_parameter< const IntegerVector& >::type y(ySEXP); - Rcpp::traits::input_parameter< const int >::type L(LSEXP); - Rcpp::traits::input_parameter< const int >::type nG(nGSEXP); - Rcpp::traits::input_parameter< const NumericVector& >::type lg_beta(lg_betaSEXP); - Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); - Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); - Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); - rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fastVector(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); - return rcpp_result_gen; -END_RCPP -} -// cG_CalcGibbsProbY_fastVectorFlip -NumericVector cG_CalcGibbsProbY_fastVectorFlip(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); -RcppExport SEXP _celda_cG_CalcGibbsProbY_fastVectorFlip(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +// cG_CalcGibbsProbY_fast +NumericVector cG_CalcGibbsProbY_fast(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY_fast(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -115,7 +92,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); - rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fastVectorFlip(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fast(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); return rcpp_result_gen; END_RCPP } @@ -179,9 +156,8 @@ RcppExport SEXP _rowSumByGroupChange(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11}, {"_celda_cG_CalcGibbsProbY", (DL_FUNC) &_celda_cG_CalcGibbsProbY, 13}, + {"_celda_cG_CalcGibbsProbY_fastRow", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastRow, 13}, {"_celda_cG_CalcGibbsProbY_fast", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fast, 13}, - {"_celda_cG_CalcGibbsProbY_fastVector", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastVector, 13}, - {"_celda_cG_CalcGibbsProbY_fastVectorFlip", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastVectorFlip, 13}, {"_celda_eigenMatMultInt", (DL_FUNC) &_celda_eigenMatMultInt, 2}, {"_celda_fastNormProp", (DL_FUNC) &_celda_fastNormProp, 2}, {"_celda_fastNormPropLog", (DL_FUNC) &_celda_fastNormPropLog, 2}, diff --git a/src/cG_calcGibbsProbY.cpp b/src/cG_calcGibbsProbY.cpp index 402e91c5..dc8de4b5 100644 --- a/src/cG_calcGibbsProbY.cpp +++ b/src/cG_calcGibbsProbY.cpp @@ -122,7 +122,7 @@ NumericVector cG_CalcGibbsProbY(const int index, // [[Rcpp::export]] -NumericVector cG_CalcGibbsProbY_fast(const int index, +NumericVector cG_CalcGibbsProbY_fastRow(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, @@ -184,74 +184,7 @@ NumericVector cG_CalcGibbsProbY_fast(const int index, // [[Rcpp::export]] -NumericVector cG_CalcGibbsProbY_fastVector(const int index, - const IntegerMatrix& counts, - const IntegerMatrix& nTSbyC, - const IntegerVector& nbyTS, - const IntegerVector& nGbyTS, - const IntegerVector& nbyG, - const IntegerVector& y, - const int L, - const int nG, - const NumericVector& lg_beta, - const NumericVector& lg_gamma, - const NumericVector& lg_delta, - const double delta) { - - int index0 = index - 1; - int current_y = y[index0] - 1; - int i; - int j,k; - - NumericVector probs(L); - - // Calculate probabilities related to the "n.TS.by.C" part of equation one time up front - // The first case of if statement represents when the current feature is already added to that module - // The second case represents when the current feature is NOT YET added to that module - for (i = 0; i < L; i++) { - if (i == current_y ) { - for (int col = 0; col < counts.ncol(); col++) { - j = col * L + i; // Index for the current module in the n.TS.by.C matrix - k = col * nG + index0; // Index for the current feature in counts matrix - probs[i] += lg_beta[nTSbyC[j]]; - probs[i] -= lg_beta[nTSbyC[j] - counts[k]]; - } - } else { - for (int col = 0; col < counts.ncol(); col++) { - j = col * L + i; - k = col * nG + index0; - probs[i] += lg_beta[nTSbyC[j] + counts[k]]; - probs[i] -= lg_beta[nTSbyC[j]]; - } - } - } - - // Calculate the probabilities for each module - // If statements determine whether to add or subtract counts from each probability - for (i = 0; i < L; i++) { - if (i == current_y) { - probs[i] += lg_gamma[nGbyTS[i]]; - probs[i] -= lg_gamma[nGbyTS[i] - 1]; - probs[i] += lg_delta[nGbyTS[i]]; - probs[i] -= lg_delta[nGbyTS[i] - 1]; - probs[i] += lgamma(nbyTS[i] - nbyG[index0] + (nGbyTS[i] - 1) * delta); - probs[i] -= lgamma(nbyTS[i] + nGbyTS[i] * delta); - } else { - probs[i] += lg_gamma[nGbyTS[i] + 1]; - probs[i] -= lg_gamma[nGbyTS[i]]; - probs[i] += lg_delta[nGbyTS[i] + 1]; - probs[i] -= lg_delta[nGbyTS[i]]; - probs[i] += lgamma(nbyTS[i] + nGbyTS[i] * delta); - probs[i] -= lgamma(nbyTS[i] + nbyG[index0] + (nGbyTS[i] + 1) * delta); - } - } - - return(probs); -} - - -// [[Rcpp::export]] -NumericVector cG_CalcGibbsProbY_fastVectorFlip(const int index, +NumericVector cG_CalcGibbsProbY_fast(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, From 33dee49e9bb09429c7b08778cbe94e0447311b8a Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 4 Dec 2019 18:26:19 -0500 Subject: [PATCH 079/149] decontx bug fix no geneNames --- R/decon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index 58431383..bc98c1ef 100644 --- a/R/decon.R +++ b/R/decon.R @@ -322,8 +322,8 @@ decontX <- function(counts, haveEmptyGenes = FALSE totalGenes = nrow(counts) noneEmptyGeneIndex = rowSums(counts) != 0 + geneNames = rownames(counts) if (sum(noneEmptyGeneIndex) != totalGenes) { - geneNames = rownames(counts) counts = counts[noneEmptyGeneIndex, ] haveEmptyGenes = TRUE } From 19be159aba151e8b4d0688ba7d8492e1198e73ef Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 4 Dec 2019 19:50:52 -0500 Subject: [PATCH 080/149] code format --- R/decon.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/R/decon.R b/R/decon.R index bc98c1ef..f46fd55e 100644 --- a/R/decon.R +++ b/R/decon.R @@ -318,14 +318,14 @@ decontX <- function(counts, delta = 10, logfile = NULL, verbose = TRUE) { - # empty expression genes won't be used for estimation - haveEmptyGenes = FALSE - totalGenes = nrow(counts) - noneEmptyGeneIndex = rowSums(counts) != 0 + # empty expression genes won't be used for estimation + haveEmptyGenes <- FALSE + totalGenes <- nrow(counts) + noneEmptyGeneIndex <- rowSums(counts) != 0 geneNames = rownames(counts) if (sum(noneEmptyGeneIndex) != totalGenes) { - counts = counts[noneEmptyGeneIndex, ] - haveEmptyGenes = TRUE + counts <- counts[noneEmptyGeneIndex, ] + haveEmptyGenes <- TRUE } if (!is.null(batch)) { @@ -359,13 +359,13 @@ decontX <- function(counts, verbose = verbose ) - if (haveEmptyGenes) { + if (haveEmptyGenes) { estRmat[noneEmptyGeneIndex, batch == bat] <- resBat$resList$estNativeCounts } else { estRmat[, batch == bat] <- resBat$resList$estNativeCounts - } + } estConp[batch == bat] <- resBat$resList$estConp theta[batch == bat] <- resBat$resList$theta @@ -394,7 +394,7 @@ decontX <- function(counts, } # when there is only one batch - resultsOneBatch = .decontXoneBatch( + resultsOneBatch <- .decontXoneBatch( counts = counts, z = z, maxIter = maxIter, @@ -403,10 +403,10 @@ decontX <- function(counts, verbose = verbose ) if (haveEmptyGenes) { - resBat = matrix(0, nrow = totalGenes, ncol = ncol(counts), + resBat <- matrix(0, nrow = totalGenes, ncol = ncol(counts), dimnames = list(geneNames, colnames(counts))) - resBat[noneEmptyGeneIndex, ] = resultsOneBatch$resList$estNativeCounts - resultsOneBatch$resList$estNativeCounts = resBat + resBat[noneEmptyGeneIndex, ] <- resultsOneBatch$resList$estNativeCounts + resultsOneBatch$resList$estNativeCounts <- resBat } return(resultsOneBatch) From 2646d1f5565d7b4bd96fce56a8495f7aaba11050 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 4 Dec 2019 20:08:55 -0500 Subject: [PATCH 081/149] change cG_calcGibbsProbY* names --- R/RcppExports.R | 8 ++++---- R/celda_G.R | 2 +- src/RcppExports.cpp | 20 ++++++++++---------- src/cG_calcGibbsProbY.cpp | 4 ++-- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 7f1b19a2..4734cebd 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,16 +5,16 @@ cG_calcGibbsProbY_Simple <- function(counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, .Call('_celda_cG_calcGibbsProbY_Simple', PACKAGE = 'celda', counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) } -cG_CalcGibbsProbY <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { - .Call('_celda_cG_CalcGibbsProbY', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +cG_CalcGibbsProbY_ori <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY_ori', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } cG_CalcGibbsProbY_fastRow <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { .Call('_celda_cG_CalcGibbsProbY_fastRow', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } -cG_CalcGibbsProbY_fast <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { - .Call('_celda_cG_CalcGibbsProbY_fast', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +cG_CalcGibbsProbY <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } #' Fast matrix multiplication for double x int diff --git a/R/celda_G.R b/R/celda_G.R index b6453b58..5f2b566a 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -417,7 +417,7 @@ celda_G <- function(counts, probs <- matrix(NA, ncol = nG, nrow = L) ix <- sample(seq(nG)) for (i in ix) { - probs[, i] <- cG_CalcGibbsProbY_fast( + probs[, i] <- cG_CalcGibbsProbY( index = i, counts = counts, nTSbyC = nTSByC, diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 722b8d75..118e61c4 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -27,9 +27,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// cG_CalcGibbsProbY -NumericVector cG_CalcGibbsProbY(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); -RcppExport SEXP _celda_cG_CalcGibbsProbY(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +// cG_CalcGibbsProbY_ori +NumericVector cG_CalcGibbsProbY_ori(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY_ori(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -46,7 +46,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); - rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_ori(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); return rcpp_result_gen; END_RCPP } @@ -73,9 +73,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// cG_CalcGibbsProbY_fast -NumericVector cG_CalcGibbsProbY_fast(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); -RcppExport SEXP _celda_cG_CalcGibbsProbY_fast(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +// cG_CalcGibbsProbY +NumericVector cG_CalcGibbsProbY(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -92,7 +92,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); - rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fast(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); return rcpp_result_gen; END_RCPP } @@ -155,9 +155,9 @@ RcppExport SEXP _rowSumByGroupChange(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11}, - {"_celda_cG_CalcGibbsProbY", (DL_FUNC) &_celda_cG_CalcGibbsProbY, 13}, + {"_celda_cG_CalcGibbsProbY_ori", (DL_FUNC) &_celda_cG_CalcGibbsProbY_ori, 13}, {"_celda_cG_CalcGibbsProbY_fastRow", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastRow, 13}, - {"_celda_cG_CalcGibbsProbY_fast", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fast, 13}, + {"_celda_cG_CalcGibbsProbY", (DL_FUNC) &_celda_cG_CalcGibbsProbY, 13}, {"_celda_eigenMatMultInt", (DL_FUNC) &_celda_eigenMatMultInt, 2}, {"_celda_fastNormProp", (DL_FUNC) &_celda_fastNormProp, 2}, {"_celda_fastNormPropLog", (DL_FUNC) &_celda_fastNormPropLog, 2}, diff --git a/src/cG_calcGibbsProbY.cpp b/src/cG_calcGibbsProbY.cpp index dc8de4b5..c7a3f609 100644 --- a/src/cG_calcGibbsProbY.cpp +++ b/src/cG_calcGibbsProbY.cpp @@ -50,7 +50,7 @@ NumericVector cG_calcGibbsProbY_Simple(const IntegerMatrix counts, // [[Rcpp::export]] -NumericVector cG_CalcGibbsProbY(const int index, +NumericVector cG_CalcGibbsProbY_ori(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, @@ -184,7 +184,7 @@ NumericVector cG_CalcGibbsProbY_fastRow(const int index, // [[Rcpp::export]] -NumericVector cG_CalcGibbsProbY_fast(const int index, +NumericVector cG_CalcGibbsProbY(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, From ed2ea072ec4f1159326c74400ecccfc6d73daaae Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 4 Dec 2019 20:12:07 -0500 Subject: [PATCH 082/149] code formaaaaaaaaaaaat --- R/decon.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/decon.R b/R/decon.R index f46fd55e..ce0196be 100644 --- a/R/decon.R +++ b/R/decon.R @@ -320,9 +320,9 @@ decontX <- function(counts, verbose = TRUE) { # empty expression genes won't be used for estimation haveEmptyGenes <- FALSE - totalGenes <- nrow(counts) + totalGenes <- nrow(counts) noneEmptyGeneIndex <- rowSums(counts) != 0 - geneNames = rownames(counts) + geneNames <- rownames(counts) if (sum(noneEmptyGeneIndex) != totalGenes) { counts <- counts[noneEmptyGeneIndex, ] haveEmptyGenes <- TRUE @@ -402,7 +402,7 @@ decontX <- function(counts, logfile = logfile, verbose = verbose ) - if (haveEmptyGenes) { + if (haveEmptyGenes) { resBat <- matrix(0, nrow = totalGenes, ncol = ncol(counts), dimnames = list(geneNames, colnames(counts))) resBat[noneEmptyGeneIndex, ] <- resultsOneBatch$resList$estNativeCounts From 2d61766ef7dc462a1da8111a1af39560f6f61161 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Thu, 5 Dec 2019 15:23:53 -0500 Subject: [PATCH 083/149] graph and density-based clustering when no z provided for decontX --- R/decon.R | 172 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 115 insertions(+), 57 deletions(-) diff --git a/R/decon.R b/R/decon.R index ce0196be..440244c3 100644 --- a/R/decon.R +++ b/R/decon.R @@ -263,6 +263,10 @@ simulateContaminatedMatrix <- function(C = 300, #' @param logfile Character. Messages will be redirected to a file named #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. +#' @param varGenes Positive Integer. Used only when z is not provided. +#' Need to be larger than 1. Default value is 5000 if not provided. +#' @param dbscanEps Numeric. Used only when z is not provided. +#' Need to be non-negative. Default is 1.0 if not provided. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. @@ -286,6 +290,8 @@ decontX <- function(counts, delta = 10, logfile = NULL, verbose = TRUE, + varGenes = NULL, + dbscanEps = NULL, seed = 12345) { if (is.null(seed)) { @@ -295,7 +301,9 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, logfile = logfile, - verbose = verbose) + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps) } else { with_seed(seed, res <- .decontX(counts = counts, @@ -304,7 +312,9 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, logfile = logfile, - verbose = verbose)) + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps)) } return(res) @@ -317,7 +327,9 @@ decontX <- function(counts, maxIter = 200, delta = 10, logfile = NULL, - verbose = TRUE) { + verbose = TRUE, + varGenes = NULL, + dbscanEps = NULL) { # empty expression genes won't be used for estimation haveEmptyGenes <- FALSE totalGenes <- nrow(counts) @@ -356,7 +368,9 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, logfile = logfile, - verbose = verbose + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps ) if (haveEmptyGenes) { @@ -400,7 +414,9 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, logfile = logfile, - verbose = verbose + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps ) if (haveEmptyGenes) { resBat <- matrix(0, nrow = totalGenes, ncol = ncol(counts), @@ -420,7 +436,9 @@ decontX <- function(counts, maxIter = 200, delta = 10, logfile = NULL, - verbose = TRUE) { + verbose = TRUE, + varGenes = NULL, + dbscanEps = NULL) { .checkCountsDecon(counts) .checkParametersDecon(proportionPrior = delta) @@ -429,11 +447,37 @@ decontX <- function(counts, K <- length(unique(z)) if (is.null(z)) { - deconMethod <- "background" + .logMessages( + paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages( + "Clustering using graph and density-based method to find cell clusters", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages( + paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + #deconMethod <- "background" + deconMethod <- "clustering" + + varGenes = .processvarGenes(varGenes) + dbscanEps = .processdbscanEps(dbscanEps) + + z <- .decontxInitializeZ(object = counts, + varGenes = varGenes, + dbscanEps = dbscanEps) } else { deconMethod <- "clustering" - z <- .processCellLabels(z, numCells = nC) } + z <- .processCellLabels(z, numCells = nC) iter <- 1L numIterWithoutImprovement <- 0L @@ -645,7 +689,8 @@ decontX <- function(counts, ) runParams <- list("deltaInit" = deltaInit, - "iteration" = iter - 1L) + "iteration" = iter - 1L, + "z" = z) resList <- list( "logLikelihood" = ll, @@ -694,7 +739,7 @@ decontX <- function(counts, " 'counts' matrix.") } if (length(unique(z)) < 2) { - stop("'z' must have at least 2 different values.") # Even though + stop("No need to decontaminate when only one cluster is in the dataset.") # Even though # everything runs smoothly when length(unique(z)) == 1, result is not # trustful } @@ -726,58 +771,71 @@ addLogLikelihood <- function(llA, llB) { ## Initialization of cell labels for DecontX when they are not given .decontxInitializeZ <- - function(counts, - K = 10, - minCell = 3, - seed = 428) { - nC <- ncol(counts) - if (nC < 100) { - K <- ceiling(sqrt(nC)) + function(object, # object is either a sce object or a count matrix + varGenes = 5000, + L = 50, + dbscanEps = 1) { + + if (!is(object, "SingleCellExperiment")) { + object = SingleCellExperiment::SingleCellExperiment(assays = list(object = object)) } - globalZ <- .initializeSplitZ( - counts, - K = K, - KSubcluster = NULL, - alpha = 1, - beta = 1, - minCell = 3 - ) - globalK <- max(globalZ) - - localZ <- rep(NA, nC) - for (k in seq(globalK)) { - if (sum(globalZ == k) > 2) { - localCounts <- counts[, globalZ == k] - localK <- min(K, ceiling(sqrt(ncol( - localCounts - )))) - localZ[globalZ == k] <- .initializeSplitZ( - localCounts, - K = localK, - KSubcluster = NULL, - alpha = 1, - beta = 1, - minCell = 3 - ) - } else { - localZ [globalZ == k] <- 1L - } + # normalize + sce = scater::normalizeSCE(sce) # add the log2 normalized counts into sce object + + if (nrow(sce) <= varGenes) { + topVariableGenes = 1:nrow(sce) + } else if( nrow(sce) > varGenes ) { + # Use the top most variable genes to do rough clustering (celda_CG & Louvian graph algorithm) + mvTrend = scran::trendVar(sce, use.spikes=FALSE) + decomposeTrend = scran::decomposeVar(sce, mvTrend) + topVariableGenes = order(decomposeTrend$bio, decreasing=TRUE)[1:varGenes] + } + countsFiltered = as.matrix(counts(sce[topVariableGenes, ])) + storage.mode(countsFiltered) = "integer" + + # Celda clustering using recursive module splitting + L = min(L, nrow(countsFiltered)) + initial.module.split = recursiveSplitModule(countsFiltered, initialL=L, maxL=L, perplexity=FALSE, verbose=FALSE) + initial.modules.model = subsetCeldaList(initial.module.split, list(L=L)) + + + # Louvian community detection + fm = factorizeMatrix(countsFiltered, initial.modules.model, type="counts") + resUmap = uwot::umap(t(sqrt(fm$counts$cell)), n_neighbors=15, min_dist = 0.01, spread = 1) + + # Use dbSCAN on the UMAP to identify broad cell types + totalClusters = 1 + while(totalClusters <= 1 & dbscan.eps > 0) { + resDbscan = dbscan(resUmap, dbscanEps) + dbscanEps = dbscanEps - (0.25 * dbscanEps) + totalClusters = length(unique(resDbscan$cluster)) } - cbZ <- interaction(globalZ, localZ, lex.order = TRUE, drop = TRUE) - # combined z label - trZ <- as.integer(sub("\\..*", "", levels(cbZ), perl = TRUE)) - # transitional z label + return("z" = resDbscan$cluster) + } - cbZ <- as.integer(plyr::mapvalues(cbZ, from = levels(cbZ), - to = seq(length(levels(cbZ))))) +## process varGenes +.processvarGenes = function(varGenes) { + if (is.null(varGenes)) { + varGenes = 5000 + } else { + if (varGenes < 2 | !is.integer(varGenes)) { + stop("Parameter 'varGenes' must be an integer and larger than 1.") + } + } + return(varGenes) +} - return(list( - "globalZ" = globalZ, - "localZ" = localZ, - "trZ" = trZ, - "cbZ" = cbZ - )) +## process dbscanEps for resolusion threshold using DBSCAN +.processdbscanEps = function(dbscanEps) { + if (is.null(dbscanEps)) { + dbscanEps = 1 + } else { + if (dbscanEps < 0) { + stop("Parameter 'dbscanEps' needs to be non-negative.") + } } + return(dbscanEps) +} From 706c16ac6b9049c8e14102dbe54f7a87b7ee551a Mon Sep 17 00:00:00 2001 From: Irisapo Date: Thu, 5 Dec 2019 16:27:14 -0500 Subject: [PATCH 084/149] bug fix --- DESCRIPTION | 8 ++- R/decon.R | 42 +++++------ man/bestLogLikelihood-celdaModel-method.Rd | 1 - man/celdaGridSearch.Rd | 16 ++++- man/celdaHeatmap-celda_C-method.Rd | 1 - man/celdaHeatmap-celda_CG-method.Rd | 1 - man/celdaHeatmap-celda_G-method.Rd | 1 - man/celdaPerplexity-celdaList-method.Rd | 1 - man/celdaProbabilityMap-celda_C-method.Rd | 4 +- man/celdaProbabilityMap-celda_CG-method.Rd | 9 ++- man/celdaTsne-celda_C-method.Rd | 14 ++-- man/celdaTsne-celda_CG-method.Rd | 15 ++-- man/celdaTsne-celda_G-method.Rd | 15 ++-- man/celdaTsne.Rd | 14 +++- man/celdaUmap-celda_C-method.Rd | 19 +++-- man/celdaUmap-celda_CG-method.Rd | 17 +++-- man/celdaUmap-celda_G-method.Rd | 17 +++-- man/celdaUmap.Rd | 11 ++- man/celda_C.Rd | 23 ++++-- man/celda_CG.Rd | 28 ++++++-- man/celda_G.Rd | 23 ++++-- man/clusterProbability-celda_C-method.Rd | 4 +- man/clusterProbability-celda_CG-method.Rd | 4 +- man/clusterProbability-celda_G-method.Rd | 4 +- man/clusters-celdaModel-method.Rd | 1 - man/countChecksum-celdaList-method.Rd | 1 - man/decontX.Rd | 20 +++++- man/differentialExpression.Rd | 11 ++- man/distinctColors.Rd | 9 ++- man/factorizeMatrix-celda_C-method.Rd | 8 ++- man/factorizeMatrix-celda_CG-method.Rd | 8 ++- man/factorizeMatrix-celda_G-method.Rd | 8 ++- man/factorizeMatrix.Rd | 7 +- man/featureModuleLookup-celda_C-method.Rd | 4 +- man/featureModuleLookup-celda_CG-method.Rd | 4 +- man/featureModuleLookup-celda_G-method.Rd | 4 +- man/findMarkers.Rd | 13 +++- man/logLikelihoodHistory-celdaModel-method.Rd | 1 - man/logLikelihoodcelda_CG.Rd | 14 +++- man/matrixNames-celdaModel-method.Rd | 1 - man/moduleHeatmap.Rd | 13 +++- man/normalizeCounts.Rd | 11 ++- man/params-celdaModel-method.Rd | 1 - man/perplexity-celda_C-method.Rd | 1 - man/perplexity-celda_CG-method.Rd | 1 - man/perplexity-celda_G-method.Rd | 1 - man/plotDendro.Rd | 12 +++- man/plotDimReduceCluster.Rd | 16 +++-- man/plotDimReduceFeature.Rd | 21 ++++-- man/plotDimReduceGrid.Rd | 16 ++++- man/plotDimReduceModule.Rd | 19 +++-- man/plotHeatmap.Rd | 42 ++++++++--- man/recursiveSplitCell.Rd | 21 ++++-- man/recursiveSplitModule.Rd | 21 ++++-- man/resList-celdaList-method.Rd | 1 - man/runParams-celdaList-method.Rd | 1 - man/sampleLabel-celdaModel-method.Rd | 1 - man/semiPheatmap.Rd | 71 ++++++++++++++----- man/simulateCellscelda_C.Rd | 15 +++- man/simulateCellscelda_CG.Rd | 18 ++++- man/simulateCellscelda_G.Rd | 14 +++- man/simulateContaminatedMatrix.Rd | 11 ++- man/topRank.Rd | 3 +- tests/testthat/test-decon.R | 14 ++-- 64 files changed, 535 insertions(+), 216 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ddec3b80..476dc95a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,7 +45,11 @@ Imports: dendextend, ggdendro, pROC, - magrittr + magrittr, + scater, + scran, + SingleCellExperiment, + dbscan Suggests: testthat, knitr, @@ -62,6 +66,6 @@ LinkingTo: Rcpp, RcppEigen License: MIT + file LICENSE Encoding: UTF-8 LazyData: false -RoxygenNote: 6.1.1 +RoxygenNote: 7.0.2 BugReports: https://github.com/campbio/celda/issues biocViews: SingleCell, GeneExpression, Clustering, Sequencing, Bayesian diff --git a/R/decon.R b/R/decon.R index 440244c3..294f1b2a 100644 --- a/R/decon.R +++ b/R/decon.R @@ -444,7 +444,6 @@ decontX <- function(counts, # nG <- nrow(counts) nC <- ncol(counts) - K <- length(unique(z)) if (is.null(z)) { .logMessages( @@ -478,6 +477,7 @@ decontX <- function(counts, deconMethod <- "clustering" } z <- .processCellLabels(z, numCells = nC) + K <- length(unique(z)) iter <- 1L numIterWithoutImprovement <- 0L @@ -777,39 +777,39 @@ addLogLikelihood <- function(llA, llB) { dbscanEps = 1) { if (!is(object, "SingleCellExperiment")) { - object = SingleCellExperiment::SingleCellExperiment(assays = list(object = object)) + sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts= object)) } - # normalize - sce = scater::normalizeSCE(sce) # add the log2 normalized counts into sce object + # add the log2 normalized counts into sce object + sce <- suppressWarnings(scater::normalizeSCE(sce)) if (nrow(sce) <= varGenes) { - topVariableGenes = 1:nrow(sce) + topVariableGenes <- 1:nrow(sce) } else if( nrow(sce) > varGenes ) { # Use the top most variable genes to do rough clustering (celda_CG & Louvian graph algorithm) - mvTrend = scran::trendVar(sce, use.spikes=FALSE) - decomposeTrend = scran::decomposeVar(sce, mvTrend) - topVariableGenes = order(decomposeTrend$bio, decreasing=TRUE)[1:varGenes] + mvTrend <- scran::trendVar(sce, use.spikes=FALSE) + decomposeTrend <- scran::decomposeVar(sce, mvTrend) + topVariableGenes <- order(decomposeTrend$bio, decreasing=TRUE)[1:varGenes] } - countsFiltered = as.matrix(counts(sce[topVariableGenes, ])) - storage.mode(countsFiltered) = "integer" + countsFiltered <- as.matrix(SingleCellExperiment::counts(sce[topVariableGenes, ])) + storage.mode(countsFiltered) <- "integer" # Celda clustering using recursive module splitting L = min(L, nrow(countsFiltered)) - initial.module.split = recursiveSplitModule(countsFiltered, initialL=L, maxL=L, perplexity=FALSE, verbose=FALSE) - initial.modules.model = subsetCeldaList(initial.module.split, list(L=L)) + initial.module.split <- recursiveSplitModule(countsFiltered, initialL=L, maxL=L, perplexity=FALSE, verbose=FALSE) + initial.modules.model <- subsetCeldaList(initial.module.split, list(L=L)) # Louvian community detection - fm = factorizeMatrix(countsFiltered, initial.modules.model, type="counts") - resUmap = uwot::umap(t(sqrt(fm$counts$cell)), n_neighbors=15, min_dist = 0.01, spread = 1) + fm <- factorizeMatrix(countsFiltered, initial.modules.model, type="counts") + resUmap <- uwot::umap(t(sqrt(fm$counts$cell)), n_neighbors=15, min_dist = 0.01, spread = 1) # Use dbSCAN on the UMAP to identify broad cell types - totalClusters = 1 - while(totalClusters <= 1 & dbscan.eps > 0) { - resDbscan = dbscan(resUmap, dbscanEps) - dbscanEps = dbscanEps - (0.25 * dbscanEps) - totalClusters = length(unique(resDbscan$cluster)) + totalClusters <- 1 + while(totalClusters <= 1 & dbscanEps > 0) { + resDbscan <- dbscan::dbscan(resUmap, dbscanEps) + dbscanEps <- dbscanEps - (0.25 * dbscanEps) + totalClusters <- length(unique(resDbscan$cluster)) } return("z" = resDbscan$cluster) @@ -819,7 +819,7 @@ addLogLikelihood <- function(llA, llB) { ## process varGenes .processvarGenes = function(varGenes) { if (is.null(varGenes)) { - varGenes = 5000 + varGenes <- 5000 } else { if (varGenes < 2 | !is.integer(varGenes)) { stop("Parameter 'varGenes' must be an integer and larger than 1.") @@ -831,7 +831,7 @@ addLogLikelihood <- function(llA, llB) { ## process dbscanEps for resolusion threshold using DBSCAN .processdbscanEps = function(dbscanEps) { if (is.null(dbscanEps)) { - dbscanEps = 1 + dbscanEps <- 1 } else { if (dbscanEps < 0) { stop("Parameter 'dbscanEps' needs to be non-negative.") diff --git a/man/bestLogLikelihood-celdaModel-method.Rd b/man/bestLogLikelihood-celdaModel-method.Rd index fdcf64b0..5497314c 100644 --- a/man/bestLogLikelihood-celdaModel-method.Rd +++ b/man/bestLogLikelihood-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{bestLogLikelihood,celdaModel-method} \alias{bestLogLikelihood,celdaModel-method} \title{Get the log-likelihood} diff --git a/man/celdaGridSearch.Rd b/man/celdaGridSearch.Rd index dbea710c..bd99dd36 100644 --- a/man/celdaGridSearch.Rd +++ b/man/celdaGridSearch.Rd @@ -4,9 +4,19 @@ \alias{celdaGridSearch} \title{Run Celda in parallel with multiple parameters} \usage{ -celdaGridSearch(counts, model, paramsTest, paramsFixed = NULL, - maxIter = 200, nchains = 3, cores = 1, bestOnly = TRUE, - perplexity = TRUE, verbose = TRUE, logfilePrefix = "Celda") +celdaGridSearch( + counts, + model, + paramsTest, + paramsFixed = NULL, + maxIter = 200, + nchains = 3, + cores = 1, + bestOnly = TRUE, + perplexity = TRUE, + verbose = TRUE, + logfilePrefix = "Celda" +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaHeatmap-celda_C-method.Rd b/man/celdaHeatmap-celda_C-method.Rd index 69cf358a..0c31e134 100644 --- a/man/celdaHeatmap-celda_C-method.Rd +++ b/man/celdaHeatmap-celda_C-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{celdaHeatmap,celda_C-method} \alias{celdaHeatmap,celda_C-method} \title{Heatmap for celda_C} diff --git a/man/celdaHeatmap-celda_CG-method.Rd b/man/celdaHeatmap-celda_CG-method.Rd index b98407a7..f2908bf3 100644 --- a/man/celdaHeatmap-celda_CG-method.Rd +++ b/man/celdaHeatmap-celda_CG-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{celdaHeatmap,celda_CG-method} \alias{celdaHeatmap,celda_CG-method} \title{Heatmap for celda_CG} diff --git a/man/celdaHeatmap-celda_G-method.Rd b/man/celdaHeatmap-celda_G-method.Rd index 351acc05..5560653d 100644 --- a/man/celdaHeatmap-celda_G-method.Rd +++ b/man/celdaHeatmap-celda_G-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{celdaHeatmap,celda_G-method} \alias{celdaHeatmap,celda_G-method} \title{Heatmap for celda_CG} diff --git a/man/celdaPerplexity-celdaList-method.Rd b/man/celdaPerplexity-celdaList-method.Rd index cf69c1d4..3854be14 100644 --- a/man/celdaPerplexity-celdaList-method.Rd +++ b/man/celdaPerplexity-celdaList-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{celdaPerplexity,celdaList-method} \alias{celdaPerplexity,celdaList-method} \title{Get perplexity for every model in a celdaList} diff --git a/man/celdaProbabilityMap-celda_C-method.Rd b/man/celdaProbabilityMap-celda_C-method.Rd index 5bf494b9..4d4e04a1 100644 --- a/man/celdaProbabilityMap-celda_C-method.Rd +++ b/man/celdaProbabilityMap-celda_C-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{celdaProbabilityMap,celda_C-method} \alias{celdaProbabilityMap,celda_C-method} \title{Probability map for a celda_C model} \usage{ -\S4method{celdaProbabilityMap}{celda_C}(counts, celdaMod, - level = c("sample"), ...) +\S4method{celdaProbabilityMap}{celda_C}(counts, celdaMod, level = c("sample"), ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaProbabilityMap-celda_CG-method.Rd b/man/celdaProbabilityMap-celda_CG-method.Rd index fabc06c3..de724ed4 100644 --- a/man/celdaProbabilityMap-celda_CG-method.Rd +++ b/man/celdaProbabilityMap-celda_CG-method.Rd @@ -1,12 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{celdaProbabilityMap,celda_CG-method} \alias{celdaProbabilityMap,celda_CG-method} \title{Probability map for a celda_CG model} \usage{ -\S4method{celdaProbabilityMap}{celda_CG}(counts, celdaMod, - level = c("cellPopulation", "sample"), ...) +\S4method{celdaProbabilityMap}{celda_CG}( + counts, + celdaMod, + level = c("cellPopulation", "sample"), + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaTsne-celda_C-method.Rd b/man/celdaTsne-celda_C-method.Rd index 6e6c9670..e4f0f65a 100644 --- a/man/celdaTsne-celda_C-method.Rd +++ b/man/celdaTsne-celda_C-method.Rd @@ -1,13 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{celdaTsne,celda_C-method} \alias{celdaTsne,celda_C-method} \title{tSNE for celda_C} \usage{ -\S4method{celdaTsne}{celda_C}(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, initialDims = 20, perplexity = 20, - maxIter = 2500, seed = 12345) +\S4method{celdaTsne}{celda_C}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + initialDims = 20, + perplexity = 20, + maxIter = 2500, + seed = 12345 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaTsne-celda_CG-method.Rd b/man/celdaTsne-celda_CG-method.Rd index 2977940e..f64dba37 100644 --- a/man/celdaTsne-celda_CG-method.Rd +++ b/man/celdaTsne-celda_CG-method.Rd @@ -1,13 +1,20 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{celdaTsne,celda_CG-method} \alias{celdaTsne,celda_CG-method} \title{tSNE for celda_CG} \usage{ -\S4method{celdaTsne}{celda_CG}(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, initialDims = 20, modules = NULL, - perplexity = 20, maxIter = 2500, seed = 12345) +\S4method{celdaTsne}{celda_CG}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + initialDims = 20, + modules = NULL, + perplexity = 20, + maxIter = 2500, + seed = 12345 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaTsne-celda_G-method.Rd b/man/celdaTsne-celda_G-method.Rd index 2170fecb..05144e1d 100644 --- a/man/celdaTsne-celda_G-method.Rd +++ b/man/celdaTsne-celda_G-method.Rd @@ -1,13 +1,20 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{celdaTsne,celda_G-method} \alias{celdaTsne,celda_G-method} \title{tSNE for celda_G} \usage{ -\S4method{celdaTsne}{celda_G}(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, initialDims = 20, modules = NULL, - perplexity = 20, maxIter = 2500, seed = 12345) +\S4method{celdaTsne}{celda_G}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + initialDims = 20, + modules = NULL, + perplexity = 20, + maxIter = 2500, + seed = 12345 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaTsne.Rd b/man/celdaTsne.Rd index 14b1286f..a9027f18 100644 --- a/man/celdaTsne.Rd +++ b/man/celdaTsne.Rd @@ -4,9 +4,17 @@ \alias{celdaTsne} \title{Embeds cells in two dimensions using tSNE based on celda_CG results.} \usage{ -celdaTsne(counts, celdaMod, maxCells = 25000, minClusterSize = 100, - initialDims = 20, modules = NULL, perplexity = 20, - maxIter = 2500, ...) +celdaTsne( + counts, + celdaMod, + maxCells = 25000, + minClusterSize = 100, + initialDims = 20, + modules = NULL, + perplexity = 20, + maxIter = 2500, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaUmap-celda_C-method.Rd b/man/celdaUmap-celda_C-method.Rd index b2bfd4b2..f558a5d8 100644 --- a/man/celdaUmap-celda_C-method.Rd +++ b/man/celdaUmap-celda_C-method.Rd @@ -1,14 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{celdaUmap,celda_C-method} \alias{celdaUmap,celda_C-method} \title{umap for celda_C} \usage{ -\S4method{celdaUmap}{celda_C}(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, seed = 12345, nNeighbors = 30, - minDist = 0.75, spread = 1, pca = TRUE, initialDims = 50, - cores = 1, ...) +\S4method{celdaUmap}{celda_C}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + seed = 12345, + nNeighbors = 30, + minDist = 0.75, + spread = 1, + pca = TRUE, + initialDims = 50, + cores = 1, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaUmap-celda_CG-method.Rd b/man/celdaUmap-celda_CG-method.Rd index 2bfd3775..bfe4165d 100644 --- a/man/celdaUmap-celda_CG-method.Rd +++ b/man/celdaUmap-celda_CG-method.Rd @@ -1,13 +1,22 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{celdaUmap,celda_CG-method} \alias{celdaUmap,celda_CG-method} \title{umap for celda_CG} \usage{ -\S4method{celdaUmap}{celda_CG}(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, modules = NULL, seed = 12345, - nNeighbors = 30, minDist = 0.75, spread = 1, cores = 1, ...) +\S4method{celdaUmap}{celda_CG}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + modules = NULL, + seed = 12345, + nNeighbors = 30, + minDist = 0.75, + spread = 1, + cores = 1, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaUmap-celda_G-method.Rd b/man/celdaUmap-celda_G-method.Rd index 0a3aa2cf..9852ffc6 100644 --- a/man/celdaUmap-celda_G-method.Rd +++ b/man/celdaUmap-celda_G-method.Rd @@ -1,13 +1,22 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{celdaUmap,celda_G-method} \alias{celdaUmap,celda_G-method} \title{umap for celda_G} \usage{ -\S4method{celdaUmap}{celda_G}(counts, celdaMod, maxCells = NULL, - minClusterSize = 100, modules = NULL, seed = 12345, - nNeighbors = 30, minDist = 0.2, spread = 1, cores = 1, ...) +\S4method{celdaUmap}{celda_G}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + modules = NULL, + seed = 12345, + nNeighbors = 30, + minDist = 0.2, + spread = 1, + cores = 1, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaUmap.Rd b/man/celdaUmap.Rd index e0a5dcd6..d4ff139d 100644 --- a/man/celdaUmap.Rd +++ b/man/celdaUmap.Rd @@ -4,8 +4,15 @@ \alias{celdaUmap} \title{Embeds cells in two dimensions using umap.} \usage{ -celdaUmap(counts, celdaMod, maxCells = NULL, minClusterSize = 100, - modules = NULL, seed = 12345, ...) +celdaUmap( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + modules = NULL, + seed = 12345, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celda_C.Rd b/man/celda_C.Rd index 654389bf..0c33c93b 100644 --- a/man/celda_C.Rd +++ b/man/celda_C.Rd @@ -4,12 +4,25 @@ \alias{celda_C} \title{Cell clustering with Celda} \usage{ -celda_C(counts, sampleLabel = NULL, K, alpha = 1, beta = 1, - algorithm = c("EM", "Gibbs"), stopIter = 10, maxIter = 200, - splitOnIter = 10, splitOnLast = TRUE, seed = 12345, nchains = 3, +celda_C( + counts, + sampleLabel = NULL, + K, + alpha = 1, + beta = 1, + algorithm = c("EM", "Gibbs"), + stopIter = 10, + maxIter = 200, + splitOnIter = 10, + splitOnLast = TRUE, + seed = 12345, + nchains = 3, zInitialize = c("split", "random", "predefined"), - countChecksum = NULL, zInit = NULL, logfile = NULL, - verbose = TRUE) + countChecksum = NULL, + zInit = NULL, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celda_CG.Rd b/man/celda_CG.Rd index 4d36d5cb..c2fc946d 100644 --- a/man/celda_CG.Rd +++ b/man/celda_CG.Rd @@ -4,14 +4,30 @@ \alias{celda_CG} \title{Cell and feature clustering with Celda} \usage{ -celda_CG(counts, sampleLabel = NULL, K, L, alpha = 1, beta = 1, - delta = 1, gamma = 1, algorithm = c("EM", "Gibbs"), - stopIter = 10, maxIter = 200, splitOnIter = 10, - splitOnLast = TRUE, seed = 12345, nchains = 3, +celda_CG( + counts, + sampleLabel = NULL, + K, + L, + alpha = 1, + beta = 1, + delta = 1, + gamma = 1, + algorithm = c("EM", "Gibbs"), + stopIter = 10, + maxIter = 200, + splitOnIter = 10, + splitOnLast = TRUE, + seed = 12345, + nchains = 3, zInitialize = c("split", "random", "predefined"), yInitialize = c("split", "random", "predefined"), - countChecksum = NULL, zInit = NULL, yInit = NULL, logfile = NULL, - verbose = TRUE) + countChecksum = NULL, + zInit = NULL, + yInit = NULL, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celda_G.Rd b/man/celda_G.Rd index 520102ca..85dcfa87 100644 --- a/man/celda_G.Rd +++ b/man/celda_G.Rd @@ -4,11 +4,24 @@ \alias{celda_G} \title{Feature clustering with Celda} \usage{ -celda_G(counts, L, beta = 1, delta = 1, gamma = 1, stopIter = 10, - maxIter = 200, splitOnIter = 10, splitOnLast = TRUE, - seed = 12345, nchains = 3, yInitialize = c("split", "random", - "predefined"), countChecksum = NULL, yInit = NULL, logfile = NULL, - verbose = TRUE) +celda_G( + counts, + L, + beta = 1, + delta = 1, + gamma = 1, + stopIter = 10, + maxIter = 200, + splitOnIter = 10, + splitOnLast = TRUE, + seed = 12345, + nchains = 3, + yInitialize = c("split", "random", "predefined"), + countChecksum = NULL, + yInit = NULL, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/clusterProbability-celda_C-method.Rd b/man/clusterProbability-celda_C-method.Rd index 16dbf199..16a165c6 100644 --- a/man/clusterProbability-celda_C-method.Rd +++ b/man/clusterProbability-celda_C-method.Rd @@ -1,13 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{clusterProbability,celda_C-method} \alias{clusterProbability,celda_C-method} \title{Conditional probabilities for cells in subpopulations from a Celda_C model} \usage{ -\S4method{clusterProbability}{celda_C}(counts, celdaMod, log = FALSE, - ...) +\S4method{clusterProbability}{celda_C}(counts, celdaMod, log = FALSE, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/clusterProbability-celda_CG-method.Rd b/man/clusterProbability-celda_CG-method.Rd index 98a80896..c5828a7a 100644 --- a/man/clusterProbability-celda_CG-method.Rd +++ b/man/clusterProbability-celda_CG-method.Rd @@ -1,13 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{clusterProbability,celda_CG-method} \alias{clusterProbability,celda_CG-method} \title{Conditional probabilities for cells and features from a Celda_CG model} \usage{ -\S4method{clusterProbability}{celda_CG}(counts, celdaMod, log = FALSE, - ...) +\S4method{clusterProbability}{celda_CG}(counts, celdaMod, log = FALSE, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/clusterProbability-celda_G-method.Rd b/man/clusterProbability-celda_G-method.Rd index 8eb2c673..54826b5e 100644 --- a/man/clusterProbability-celda_G-method.Rd +++ b/man/clusterProbability-celda_G-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{clusterProbability,celda_G-method} \alias{clusterProbability,celda_G-method} \title{Conditional probabilities for features in modules from a Celda_G model} \usage{ -\S4method{clusterProbability}{celda_G}(counts, celdaMod, log = FALSE, - ...) +\S4method{clusterProbability}{celda_G}(counts, celdaMod, log = FALSE, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/clusters-celdaModel-method.Rd b/man/clusters-celdaModel-method.Rd index c8110f06..fcfdde8e 100644 --- a/man/clusters-celdaModel-method.Rd +++ b/man/clusters-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{clusters,celdaModel-method} \alias{clusters,celdaModel-method} \title{Get clustering outcomes from a celdaModel} diff --git a/man/countChecksum-celdaList-method.Rd b/man/countChecksum-celdaList-method.Rd index b6f683a8..8fcb23d8 100644 --- a/man/countChecksum-celdaList-method.Rd +++ b/man/countChecksum-celdaList-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{countChecksum,celdaList-method} \alias{countChecksum,celdaList-method} \title{Get the MD5 hash of the count matrix from the celdaList} diff --git a/man/decontX.Rd b/man/decontX.Rd index d0222870..145f06c6 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -4,8 +4,18 @@ \alias{decontX} \title{Decontaminate count matrix} \usage{ -decontX(counts, z = NULL, batch = NULL, maxIter = 200, delta = 10, - logfile = NULL, verbose = TRUE, seed = 12345) +decontX( + counts, + z = NULL, + batch = NULL, + maxIter = 200, + delta = 10, + logfile = NULL, + verbose = TRUE, + varGenes = NULL, + dbscanEps = NULL, + seed = 12345 +) } \arguments{ \item{counts}{Numeric/Integer matrix. Observed count matrix, rows represent @@ -26,6 +36,12 @@ to be 10.} \item{verbose}{Logical. Whether to print log messages. Default TRUE.} +\item{varGenes}{Positive Integer. Used only when z is not provided. +Need to be larger than 1. Default value is 5000 if not provided.} + +\item{dbscanEps}{Numeric. Used only when z is not provided. +Need to be non-negative. Default is 1.0 if not provided.} + \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} diff --git a/man/differentialExpression.Rd b/man/differentialExpression.Rd index 698a56b4..3fdf5102 100644 --- a/man/differentialExpression.Rd +++ b/man/differentialExpression.Rd @@ -4,8 +4,15 @@ \alias{differentialExpression} \title{Differential expression for cell subpopulations using MAST} \usage{ -differentialExpression(counts, celdaMod, c1, c2 = NULL, - onlyPos = FALSE, log2fcThreshold = NULL, fdrThreshold = 1) +differentialExpression( + counts, + celdaMod, + c1, + c2 = NULL, + onlyPos = FALSE, + log2fcThreshold = NULL, + fdrThreshold = 1 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/distinctColors.Rd b/man/distinctColors.Rd index 4fc38644..13cf5b60 100644 --- a/man/distinctColors.Rd +++ b/man/distinctColors.Rd @@ -4,9 +4,12 @@ \alias{distinctColors} \title{Create a color palette} \usage{ -distinctColors(n, hues = c("red", "cyan", "orange", "blue", "yellow", - "purple", "green", "magenta"), saturationRange = c(0.7, 1), - valueRange = c(0.7, 1)) +distinctColors( + n, + hues = c("red", "cyan", "orange", "blue", "yellow", "purple", "green", "magenta"), + saturationRange = c(0.7, 1), + valueRange = c(0.7, 1) +) } \arguments{ \item{n}{Integer. Number of colors to generate.} diff --git a/man/factorizeMatrix-celda_C-method.Rd b/man/factorizeMatrix-celda_C-method.Rd index d2a29ef9..244b90f8 100644 --- a/man/factorizeMatrix-celda_C-method.Rd +++ b/man/factorizeMatrix-celda_C-method.Rd @@ -1,12 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{factorizeMatrix,celda_C-method} \alias{factorizeMatrix,celda_C-method} \title{Matrix factorization for results from celda_C()} \usage{ -\S4method{factorizeMatrix}{celda_C}(counts, celdaMod, type = c("counts", - "proportion", "posterior")) +\S4method{factorizeMatrix}{celda_C}( + counts, + celdaMod, + type = c("counts", "proportion", "posterior") +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/factorizeMatrix-celda_CG-method.Rd b/man/factorizeMatrix-celda_CG-method.Rd index eba62840..8543d614 100644 --- a/man/factorizeMatrix-celda_CG-method.Rd +++ b/man/factorizeMatrix-celda_CG-method.Rd @@ -1,12 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{factorizeMatrix,celda_CG-method} \alias{factorizeMatrix,celda_CG-method} \title{Matrix factorization for results from celda_CG} \usage{ -\S4method{factorizeMatrix}{celda_CG}(counts, celdaMod, type = c("counts", - "proportion", "posterior")) +\S4method{factorizeMatrix}{celda_CG}( + counts, + celdaMod, + type = c("counts", "proportion", "posterior") +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/factorizeMatrix-celda_G-method.Rd b/man/factorizeMatrix-celda_G-method.Rd index 9b5d2dad..7dbdd9de 100644 --- a/man/factorizeMatrix-celda_G-method.Rd +++ b/man/factorizeMatrix-celda_G-method.Rd @@ -1,12 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{factorizeMatrix,celda_G-method} \alias{factorizeMatrix,celda_G-method} \title{Matrix factorization for results from celda_G} \usage{ -\S4method{factorizeMatrix}{celda_G}(counts, celdaMod, type = c("counts", - "proportion", "posterior")) +\S4method{factorizeMatrix}{celda_G}( + counts, + celdaMod, + type = c("counts", "proportion", "posterior") +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/factorizeMatrix.Rd b/man/factorizeMatrix.Rd index e906ee71..35487343 100644 --- a/man/factorizeMatrix.Rd +++ b/man/factorizeMatrix.Rd @@ -5,8 +5,11 @@ \title{Generate factorized matrices showing each feature's influence on cell / gene clustering} \usage{ -factorizeMatrix(counts, celdaMod, type = c("counts", "proportion", - "posterior")) +factorizeMatrix( + counts, + celdaMod, + type = c("counts", "proportion", "posterior") +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/featureModuleLookup-celda_C-method.Rd b/man/featureModuleLookup-celda_C-method.Rd index 07eeb520..2d15b0cf 100644 --- a/man/featureModuleLookup-celda_C-method.Rd +++ b/man/featureModuleLookup-celda_C-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{featureModuleLookup,celda_C-method} \alias{featureModuleLookup,celda_C-method} \title{Lookup the module of a feature} \usage{ -\S4method{featureModuleLookup}{celda_C}(counts, celdaMod, feature, - exactMatch = TRUE) +\S4method{featureModuleLookup}{celda_C}(counts, celdaMod, feature, exactMatch = TRUE) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/featureModuleLookup-celda_CG-method.Rd b/man/featureModuleLookup-celda_CG-method.Rd index d16c9a3f..7404cc03 100644 --- a/man/featureModuleLookup-celda_CG-method.Rd +++ b/man/featureModuleLookup-celda_CG-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{featureModuleLookup,celda_CG-method} \alias{featureModuleLookup,celda_CG-method} \title{Lookup the module of a feature} \usage{ -\S4method{featureModuleLookup}{celda_CG}(counts, celdaMod, feature, - exactMatch = TRUE) +\S4method{featureModuleLookup}{celda_CG}(counts, celdaMod, feature, exactMatch = TRUE) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/featureModuleLookup-celda_G-method.Rd b/man/featureModuleLookup-celda_G-method.Rd index b4b5274d..b2481c08 100644 --- a/man/featureModuleLookup-celda_G-method.Rd +++ b/man/featureModuleLookup-celda_G-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{featureModuleLookup,celda_G-method} \alias{featureModuleLookup,celda_G-method} \title{Lookup the module of a feature} \usage{ -\S4method{featureModuleLookup}{celda_G}(counts, celdaMod, feature, - exactMatch = TRUE) +\S4method{featureModuleLookup}{celda_G}(counts, celdaMod, feature, exactMatch = TRUE) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/findMarkers.Rd b/man/findMarkers.Rd index 113c5ac4..403315ce 100644 --- a/man/findMarkers.Rd +++ b/man/findMarkers.Rd @@ -4,9 +4,16 @@ \alias{findMarkers} \title{Generate decision tree from single-cell clustering output.} \usage{ -findMarkers(features, class, cellTypes, oneoffMetric = c("modified F1", - "pairwise AUC"), threshold = 0.95, reuseFeatures = FALSE, - altSplit = TRUE, consecutiveOneoff = TRUE) +findMarkers( + features, + class, + cellTypes, + oneoffMetric = c("modified F1", "pairwise AUC"), + threshold = 0.95, + reuseFeatures = FALSE, + altSplit = TRUE, + consecutiveOneoff = TRUE +) } \arguments{ \item{features}{A L(features) by N(samples) numeric matrix.} diff --git a/man/logLikelihoodHistory-celdaModel-method.Rd b/man/logLikelihoodHistory-celdaModel-method.Rd index 06ceffbf..ce378406 100644 --- a/man/logLikelihoodHistory-celdaModel-method.Rd +++ b/man/logLikelihoodHistory-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{logLikelihoodHistory,celdaModel-method} \alias{logLikelihoodHistory,celdaModel-method} \title{Get log-likelihood history} diff --git a/man/logLikelihoodcelda_CG.Rd b/man/logLikelihoodcelda_CG.Rd index 7290168e..e99dff79 100644 --- a/man/logLikelihoodcelda_CG.Rd +++ b/man/logLikelihoodcelda_CG.Rd @@ -4,8 +4,18 @@ \alias{logLikelihoodcelda_CG} \title{Calculate Celda_CG log likelihood} \usage{ -logLikelihoodcelda_CG(counts, sampleLabel, z, y, K, L, alpha, beta, delta, - gamma) +logLikelihoodcelda_CG( + counts, + sampleLabel, + z, + y, + K, + L, + alpha, + beta, + delta, + gamma +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/matrixNames-celdaModel-method.Rd b/man/matrixNames-celdaModel-method.Rd index 9c8556a7..fcfbb563 100644 --- a/man/matrixNames-celdaModel-method.Rd +++ b/man/matrixNames-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{matrixNames,celdaModel-method} \alias{matrixNames,celdaModel-method} \title{Get feature, cell and sample names from a celdaModel} diff --git a/man/moduleHeatmap.Rd b/man/moduleHeatmap.Rd index fab5aa9b..6307d967 100644 --- a/man/moduleHeatmap.Rd +++ b/man/moduleHeatmap.Rd @@ -4,9 +4,16 @@ \alias{moduleHeatmap} \title{Heatmap for featureModules} \usage{ -moduleHeatmap(counts, celdaMod, featureModule = 1, topCells = 100, - topFeatures = NULL, normalizedCounts = NA, scaleRow = scale, - showFeaturenames = TRUE) +moduleHeatmap( + counts, + celdaMod, + featureModule = 1, + topCells = 100, + topFeatures = NULL, + normalizedCounts = NA, + scaleRow = scale, + showFeaturenames = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/normalizeCounts.Rd b/man/normalizeCounts.Rd index bb63644d..3bc33b8a 100644 --- a/man/normalizeCounts.Rd +++ b/man/normalizeCounts.Rd @@ -4,9 +4,14 @@ \alias{normalizeCounts} \title{Normalization of count data} \usage{ -normalizeCounts(counts, normalize = c("proportion", "cpm", "median", - "mean"), transformationFun = NULL, scaleFun = NULL, - pseudocountNormalize = 0, pseudocountTransform = 0) +normalizeCounts( + counts, + normalize = c("proportion", "cpm", "median", "mean"), + transformationFun = NULL, + scaleFun = NULL, + pseudocountNormalize = 0, + pseudocountTransform = 0 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/params-celdaModel-method.Rd b/man/params-celdaModel-method.Rd index 740dfa0f..7715228a 100644 --- a/man/params-celdaModel-method.Rd +++ b/man/params-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{params,celdaModel-method} \alias{params,celdaModel-method} \title{Get parameter values provided for celdaModel creation} diff --git a/man/perplexity-celda_C-method.Rd b/man/perplexity-celda_C-method.Rd index 79cd82c8..18b60daa 100644 --- a/man/perplexity-celda_C-method.Rd +++ b/man/perplexity-celda_C-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{perplexity,celda_C-method} \alias{perplexity,celda_C-method} \title{Calculate the perplexity on new data with a celda_C model} diff --git a/man/perplexity-celda_CG-method.Rd b/man/perplexity-celda_CG-method.Rd index d6861632..89d1e9e5 100644 --- a/man/perplexity-celda_CG-method.Rd +++ b/man/perplexity-celda_CG-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{perplexity,celda_CG-method} \alias{perplexity,celda_CG-method} \title{Calculate the perplexity on new data with a celda_CG model} diff --git a/man/perplexity-celda_G-method.Rd b/man/perplexity-celda_G-method.Rd index ffbc5967..7db11951 100644 --- a/man/perplexity-celda_G-method.Rd +++ b/man/perplexity-celda_G-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{perplexity,celda_G-method} \alias{perplexity,celda_G-method} \title{Calculate the perplexity on new data with a celda_G model} diff --git a/man/plotDendro.Rd b/man/plotDendro.Rd index efa2e14b..d9106f55 100644 --- a/man/plotDendro.Rd +++ b/man/plotDendro.Rd @@ -4,9 +4,15 @@ \alias{plotDendro} \title{Plots dendrogram of `findMarkers` output} \usage{ -plotDendro(decisionTree, classLabel = NULL, addSensPrec = FALSE, - maxFeaturePrint = 4, leafSize = 24, boxSize = 7, - boxColor = "black") +plotDendro( + decisionTree, + classLabel = NULL, + addSensPrec = FALSE, + maxFeaturePrint = 4, + leafSize = 24, + boxSize = 7, + boxColor = "black" +) } \arguments{ \item{decisionTree}{List object. The output of `celda::findMarkers`.} diff --git a/man/plotDimReduceCluster.Rd b/man/plotDimReduceCluster.Rd index a6e9dd11..e8bd96ee 100644 --- a/man/plotDimReduceCluster.Rd +++ b/man/plotDimReduceCluster.Rd @@ -4,10 +4,18 @@ \alias{plotDimReduceCluster} \title{Plotting the cell labels on a dimensionality reduction plot} \usage{ -plotDimReduceCluster(dim1, dim2, cluster, size = 1, - xlab = "Dimension_1", ylab = "Dimension_2", - specificClusters = NULL, labelClusters = FALSE, groupBy = NULL, - labelSize = 3.5) +plotDimReduceCluster( + dim1, + dim2, + cluster, + size = 1, + xlab = "Dimension_1", + ylab = "Dimension_2", + specificClusters = NULL, + labelClusters = FALSE, + groupBy = NULL, + labelSize = 3.5 +) } \arguments{ \item{dim1}{Numeric vector. First dimension from data diff --git a/man/plotDimReduceFeature.Rd b/man/plotDimReduceFeature.Rd index 6f46fc23..871b356b 100644 --- a/man/plotDimReduceFeature.Rd +++ b/man/plotDimReduceFeature.Rd @@ -4,10 +4,23 @@ \alias{plotDimReduceFeature} \title{Plotting feature expression on a dimensionality reduction plot} \usage{ -plotDimReduceFeature(dim1, dim2, counts, features, headers = NULL, - normalize = TRUE, exactMatch = TRUE, trim = c(-2, 2), size = 1, - xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "blue", - colorMid = "white", colorHigh = "red", ncol = NULL) +plotDimReduceFeature( + dim1, + dim2, + counts, + features, + headers = NULL, + normalize = TRUE, + exactMatch = TRUE, + trim = c(-2, 2), + size = 1, + xlab = "Dimension_1", + ylab = "Dimension_2", + colorLow = "blue", + colorMid = "white", + colorHigh = "red", + ncol = NULL +) } \arguments{ \item{dim1}{Numeric vector. First dimension from data diff --git a/man/plotDimReduceGrid.Rd b/man/plotDimReduceGrid.Rd index fd0fc390..1f9b328c 100644 --- a/man/plotDimReduceGrid.Rd +++ b/man/plotDimReduceGrid.Rd @@ -4,8 +4,20 @@ \alias{plotDimReduceGrid} \title{Mapping the dimensionality reduction plot} \usage{ -plotDimReduceGrid(dim1, dim2, matrix, size, xlab, ylab, colorLow, colorMid, - colorHigh, varLabel, ncol = NULL, headers = NULL) +plotDimReduceGrid( + dim1, + dim2, + matrix, + size, + xlab, + ylab, + colorLow, + colorMid, + colorHigh, + varLabel, + ncol = NULL, + headers = NULL +) } \arguments{ \item{dim1}{Numeric vector. First dimension from data dimensionality diff --git a/man/plotDimReduceModule.Rd b/man/plotDimReduceModule.Rd index 0373bb13..8ab5f99d 100644 --- a/man/plotDimReduceModule.Rd +++ b/man/plotDimReduceModule.Rd @@ -5,10 +5,21 @@ \title{Plotting the Celda module probability on a dimensionality reduction plot} \usage{ -plotDimReduceModule(dim1, dim2, counts, celdaMod, modules = NULL, - rescale = TRUE, size = 1, xlab = "Dimension_1", - ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, - colorHigh = "blue", ncol = NULL) +plotDimReduceModule( + dim1, + dim2, + counts, + celdaMod, + modules = NULL, + rescale = TRUE, + size = 1, + xlab = "Dimension_1", + ylab = "Dimension_2", + colorLow = "grey", + colorMid = NULL, + colorHigh = "blue", + ncol = NULL +) } \arguments{ \item{dim1}{Numeric vector. diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index 387e6191..9dbac3a2 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -4,18 +4,38 @@ \alias{plotHeatmap} \title{Plots heatmap based on Celda model} \usage{ -plotHeatmap(counts, z = NULL, y = NULL, rowGroupOrder = NULL, - colGroupOrder = NULL, scaleRow = scale, trim = c(-2, 2), - featureIx = NULL, cellIx = NULL, clusterFeature = TRUE, - clusterCell = TRUE, colorScheme = c("divergent", "sequential"), - colorSchemeSymmetric = TRUE, colorSchemeCenter = 0, col = NULL, - annotationCell = NULL, annotationFeature = NULL, - annotationColor = NULL, breaks = NULL, legend = TRUE, - annotationLegend = TRUE, annotationNamesFeature = TRUE, - annotationNamesCell = TRUE, showNamesFeature = FALSE, - showNamesCell = FALSE, hclustMethod = "ward.D2", +plotHeatmap( + counts, + z = NULL, + y = NULL, + rowGroupOrder = NULL, + colGroupOrder = NULL, + scaleRow = scale, + trim = c(-2, 2), + featureIx = NULL, + cellIx = NULL, + clusterFeature = TRUE, + clusterCell = TRUE, + colorScheme = c("divergent", "sequential"), + colorSchemeSymmetric = TRUE, + colorSchemeCenter = 0, + col = NULL, + annotationCell = NULL, + annotationFeature = NULL, + annotationColor = NULL, + breaks = NULL, + legend = TRUE, + annotationLegend = TRUE, + annotationNamesFeature = TRUE, + annotationNamesCell = TRUE, + showNamesFeature = FALSE, + showNamesCell = FALSE, + hclustMethod = "ward.D2", treeheightFeature = ifelse(clusterFeature, 50, 0), - treeheightCell = ifelse(clusterCell, 50, 0), silent = FALSE, ...) + treeheightCell = ifelse(clusterCell, 50, 0), + silent = FALSE, + ... +) } \arguments{ \item{counts}{Numeric matrix. Normalized counts matrix where rows represent diff --git a/man/recursiveSplitCell.Rd b/man/recursiveSplitCell.Rd index ad215cd6..1d8acf15 100644 --- a/man/recursiveSplitCell.Rd +++ b/man/recursiveSplitCell.Rd @@ -4,10 +4,23 @@ \alias{recursiveSplitCell} \title{Recursive cell splitting} \usage{ -recursiveSplitCell(counts, sampleLabel = NULL, initialK = 5, - maxK = 25, tempL = NULL, yInit = NULL, alpha = 1, beta = 1, - delta = 1, gamma = 1, minCell = 3, reorder = TRUE, - perplexity = TRUE, logfile = NULL, verbose = TRUE) +recursiveSplitCell( + counts, + sampleLabel = NULL, + initialK = 5, + maxK = 25, + tempL = NULL, + yInit = NULL, + alpha = 1, + beta = 1, + delta = 1, + gamma = 1, + minCell = 3, + reorder = TRUE, + perplexity = TRUE, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/recursiveSplitModule.Rd b/man/recursiveSplitModule.Rd index b9943642..c397d8b9 100644 --- a/man/recursiveSplitModule.Rd +++ b/man/recursiveSplitModule.Rd @@ -4,10 +4,23 @@ \alias{recursiveSplitModule} \title{Recursive module splitting} \usage{ -recursiveSplitModule(counts, initialL = 10, maxL = 100, tempK = 100, - zInit = NULL, sampleLabel = NULL, alpha = 1, beta = 1, - delta = 1, gamma = 1, minFeature = 3, reorder = TRUE, - perplexity = TRUE, verbose = TRUE, logfile = NULL) +recursiveSplitModule( + counts, + initialL = 10, + maxL = 100, + tempK = 100, + zInit = NULL, + sampleLabel = NULL, + alpha = 1, + beta = 1, + delta = 1, + gamma = 1, + minFeature = 3, + reorder = TRUE, + perplexity = TRUE, + verbose = TRUE, + logfile = NULL +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/resList-celdaList-method.Rd b/man/resList-celdaList-method.Rd index 5b622841..3b644935 100644 --- a/man/resList-celdaList-method.Rd +++ b/man/resList-celdaList-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{resList,celdaList-method} \alias{resList,celdaList-method} \title{Get final celdaModels from a celdaList} diff --git a/man/runParams-celdaList-method.Rd b/man/runParams-celdaList-method.Rd index 29228807..8b4c84f8 100644 --- a/man/runParams-celdaList-method.Rd +++ b/man/runParams-celdaList-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{runParams,celdaList-method} \alias{runParams,celdaList-method} \title{Get run parameters provided to `celdaGridSearch()`} diff --git a/man/sampleLabel-celdaModel-method.Rd b/man/sampleLabel-celdaModel-method.Rd index 049266dc..1e39a984 100644 --- a/man/sampleLabel-celdaModel-method.Rd +++ b/man/sampleLabel-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{sampleLabel,celdaModel-method} \alias{sampleLabel,celdaModel-method} \title{Get sampleLabels from a celdaModel} diff --git a/man/semiPheatmap.Rd b/man/semiPheatmap.Rd index dcfb9d3f..99d2baed 100644 --- a/man/semiPheatmap.Rd +++ b/man/semiPheatmap.Rd @@ -4,27 +4,60 @@ \alias{semiPheatmap} \title{A function to draw clustered heatmaps.} \usage{ -semiPheatmap(mat, color = colorRampPalette(rev(brewer.pal(n = 7, name = - "RdYlBu")))(100), kmeansK = NA, breaks = NA, - borderColor = "grey60", cellWidth = NA, cellHeight = NA, - scale = "none", clusterRows = TRUE, clusterCols = TRUE, +semiPheatmap( + mat, + color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100), + kmeansK = NA, + breaks = NA, + borderColor = "grey60", + cellWidth = NA, + cellHeight = NA, + scale = "none", + clusterRows = TRUE, + clusterCols = TRUE, clusteringDistanceRows = "euclidean", - clusteringDistanceCols = "euclidean", clusteringMethod = "complete", - clusteringCallback = .identity2, cutreeRows = NA, cutreeCols = NA, + clusteringDistanceCols = "euclidean", + clusteringMethod = "complete", + clusteringCallback = .identity2, + cutreeRows = NA, + cutreeCols = NA, treeHeightRow = ifelse(clusterRows, 50, 0), - treeHeightCol = ifelse(clusterCols, 50, 0), legend = TRUE, - legendBreaks = NA, legendLabels = NA, annotationRow = NA, - annotationCol = NA, annotation = NA, annotationColors = NA, - annotationLegend = TRUE, annotationNamesRow = TRUE, - annotationNamesCol = TRUE, dropLevels = TRUE, showRownames = TRUE, - showColnames = TRUE, main = NA, fontSize = 10, - fontSizeRow = fontSize, fontSizeCol = fontSize, - displayNumbers = FALSE, numberFormat = "\%.2f", - numberColor = "grey30", fontSizeNumber = 0.8 * fontSize, - gapsRow = NULL, gapsCol = NULL, labelsRow = NULL, - labelsCol = NULL, fileName = NA, width = NA, height = NA, - silent = FALSE, rowLabel, colLabel, rowGroupOrder = NULL, - colGroupOrder = NULL, ...) + treeHeightCol = ifelse(clusterCols, 50, 0), + legend = TRUE, + legendBreaks = NA, + legendLabels = NA, + annotationRow = NA, + annotationCol = NA, + annotation = NA, + annotationColors = NA, + annotationLegend = TRUE, + annotationNamesRow = TRUE, + annotationNamesCol = TRUE, + dropLevels = TRUE, + showRownames = TRUE, + showColnames = TRUE, + main = NA, + fontSize = 10, + fontSizeRow = fontSize, + fontSizeCol = fontSize, + displayNumbers = FALSE, + numberFormat = "\%.2f", + numberColor = "grey30", + fontSizeNumber = 0.8 * fontSize, + gapsRow = NULL, + gapsCol = NULL, + labelsRow = NULL, + labelsCol = NULL, + fileName = NA, + width = NA, + height = NA, + silent = FALSE, + rowLabel, + colLabel, + rowGroupOrder = NULL, + colGroupOrder = NULL, + ... +) } \arguments{ \item{mat}{numeric matrix of the values to be plotted.} diff --git a/man/simulateCellscelda_C.Rd b/man/simulateCellscelda_C.Rd index 75aa0b6e..d366729d 100644 --- a/man/simulateCellscelda_C.Rd +++ b/man/simulateCellscelda_C.Rd @@ -4,9 +4,18 @@ \alias{simulateCellscelda_C} \title{Simulate cells from the celda_C model} \usage{ -simulateCellscelda_C(model, S = 5, CRange = c(50, 100), - NRange = c(500, 1000), G = 100, K = 5, alpha = 1, beta = 1, - seed = 12345, ...) +simulateCellscelda_C( + model, + S = 5, + CRange = c(50, 100), + NRange = c(500, 1000), + G = 100, + K = 5, + alpha = 1, + beta = 1, + seed = 12345, + ... +) } \arguments{ \item{model}{Character. Options available in `celda::availableModels`.} diff --git a/man/simulateCellscelda_CG.Rd b/man/simulateCellscelda_CG.Rd index 2e6b167a..64f93062 100644 --- a/man/simulateCellscelda_CG.Rd +++ b/man/simulateCellscelda_CG.Rd @@ -4,9 +4,21 @@ \alias{simulateCellscelda_CG} \title{Simulate cells from the celda_CG model} \usage{ -simulateCellscelda_CG(model, S = 5, CRange = c(50, 100), - NRange = c(500, 1000), G = 100, K = 5, L = 10, alpha = 1, - beta = 1, gamma = 5, delta = 1, seed = 12345, ...) +simulateCellscelda_CG( + model, + S = 5, + CRange = c(50, 100), + NRange = c(500, 1000), + G = 100, + K = 5, + L = 10, + alpha = 1, + beta = 1, + gamma = 5, + delta = 1, + seed = 12345, + ... +) } \arguments{ \item{model}{Character. Options available in `celda::availableModels`.} diff --git a/man/simulateCellscelda_G.Rd b/man/simulateCellscelda_G.Rd index 275bba79..3ecd1aae 100644 --- a/man/simulateCellscelda_G.Rd +++ b/man/simulateCellscelda_G.Rd @@ -4,8 +4,18 @@ \alias{simulateCellscelda_G} \title{Simulate cells from the celda_G model} \usage{ -simulateCellscelda_G(model, C = 100, NRange = c(500, 1000), G = 100, - L = 10, beta = 1, gamma = 5, delta = 1, seed = 12345, ...) +simulateCellscelda_G( + model, + C = 100, + NRange = c(500, 1000), + G = 100, + L = 10, + beta = 1, + gamma = 5, + delta = 1, + seed = 12345, + ... +) } \arguments{ \item{model}{Character. Options available in `celda::availableModels`.} diff --git a/man/simulateContaminatedMatrix.Rd b/man/simulateContaminatedMatrix.Rd index adee2589..db39fcc9 100644 --- a/man/simulateContaminatedMatrix.Rd +++ b/man/simulateContaminatedMatrix.Rd @@ -4,8 +4,15 @@ \alias{simulateContaminatedMatrix} \title{Simulate contaminated count matrix} \usage{ -simulateContaminatedMatrix(C = 300, G = 100, K = 3, NRange = c(500, - 1000), beta = 0.5, delta = c(1, 2), seed = 12345) +simulateContaminatedMatrix( + C = 300, + G = 100, + K = 3, + NRange = c(500, 1000), + beta = 0.5, + delta = c(1, 2), + seed = 12345 +) } \arguments{ \item{C}{Integer. Number of cells to be simulated. Default to be 300.} diff --git a/man/topRank.Rd b/man/topRank.Rd index 3b78ca01..355be1e7 100644 --- a/man/topRank.Rd +++ b/man/topRank.Rd @@ -4,8 +4,7 @@ \alias{topRank} \title{Identify features with the highest influence on clustering.} \usage{ -topRank(matrix, n = 25, margin = 2, threshold = 0, - decreasing = TRUE) +topRank(matrix, n = 25, margin = 2, threshold = 0, decreasing = TRUE) } \arguments{ \item{matrix}{Numeric matrix.} diff --git a/tests/testthat/test-decon.R b/tests/testthat/test-decon.R index d62b9745..6daab64f 100644 --- a/tests/testthat/test-decon.R +++ b/tests/testthat/test-decon.R @@ -35,13 +35,11 @@ test_that(desc = "Testing simulateContaminatedMatrix", { }) ## DecontX -test_that(desc = "Testing DecontX", { - expect_equal(ncol(deconSim$observedCounts) + ncol(deconSim2$observedCounts), - ncol(batchDecontX$resList$estNativeCounts)) - # expect_equal(length(batchDecontX$resList$estConp) , - # ncol(batchDecontX$resList$estNativeCounts)) - expect_equal(batchDecontXBg$method, "background") -}) +#test_that(desc = "Testing DecontX", { +# expect_equal(ncol(deconSim$observedCounts) + ncol(deconSim2$observedCounts), +# ncol(batchDecontX$resList$estNativeCounts)) +# expect_equal(batchDecontXBg$method, "background") +#}) ## .decontXoneBatch test_that(desc = "Testing .decontXoneBatch", { @@ -63,7 +61,7 @@ test_that(desc = "Testing .decontXoneBatch", { expect_error(.decontXoneBatch(counts = deconSim$observedCounts, z = rep(1, ncol( deconSim$observedCounts))), - "'z' must have at least 2 different values.") + "No need to decontaminate when only one cluster is in the dataset.") countsNA <- deconSim$observedCounts countsNA[1, 1] <- NA expect_error(.decontXoneBatch(counts = countsNA, z = deconSim$z), From 10b9d696b62919d6f7a75b31361c415eab529ebe Mon Sep 17 00:00:00 2001 From: Irisapo Date: Thu, 5 Dec 2019 18:18:27 -0500 Subject: [PATCH 085/149] bug fix --- R/decon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index 294f1b2a..b79e85cc 100644 --- a/R/decon.R +++ b/R/decon.R @@ -580,7 +580,7 @@ decontX <- function(counts, if (deconMethod == "background") { ## Initialize cell label - initialLabel <- .decontxInitializeZ(counts = counts) + initialLabel <- .decontxInitializeZ(object= counts) globalZ <- initialLabel$globalZ cbZ <- initialLabel$cbZ trZ <- initialLabel$trZ From 1fd5977a653b1cc819a1667ad24ed0ae1a34ca57 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sat, 7 Dec 2019 15:09:02 -0500 Subject: [PATCH 086/149] remove deprecated background-case code in decontx --- R/decon.R | 108 ++++++++---------------------------------------------- 1 file changed, 15 insertions(+), 93 deletions(-) diff --git a/R/decon.R b/R/decon.R index b79e85cc..5af9c244 100644 --- a/R/decon.R +++ b/R/decon.R @@ -147,6 +147,7 @@ simulateContaminatedMatrix <- function(C = 300, return(ll) } +# DEPRECATED. This is not used, but is kept as it might be useful in the future. # This function calculates the log-likelihood of background distribution # decontamination # bgDist Numeric matrix. Rows represent feature and columns are the times that @@ -207,7 +208,7 @@ simulateContaminatedMatrix <- function(C = 300, )) } - +# DEPRECATED. This is not used, but is kept as it might be useful in the feature. # This function updates decontamination using background distribution .cDCalcEMbgDecontamination <- function(counts, globalZ, cbZ, trZ, phi, eta, theta) { @@ -330,7 +331,7 @@ decontX <- function(counts, verbose = TRUE, varGenes = NULL, dbscanEps = NULL) { - # empty expression genes won't be used for estimation + ## Empty expression genes won't be used for estimation haveEmptyGenes <- FALSE totalGenes <- nrow(counts) noneEmptyGeneIndex <- rowSums(counts) != 0 @@ -407,7 +408,7 @@ decontX <- function(counts, )) } - # when there is only one batch + ## When there is only one batch resultsOneBatch <- .decontXoneBatch( counts = counts, z = z, @@ -464,6 +465,7 @@ decontX <- function(counts, append = TRUE, verbose = verbose ) + ## Always uses clusters for DecontX estimation #deconMethod <- "background" deconMethod <- "clustering" @@ -473,9 +475,8 @@ decontX <- function(counts, z <- .decontxInitializeZ(object = counts, varGenes = varGenes, dbscanEps = dbscanEps) - } else { - deconMethod <- "clustering" } + z <- .processCellLabels(z, numCells = nC) K <- length(unique(z)) @@ -578,83 +579,6 @@ decontX <- function(counts, } } - if (deconMethod == "background") { - ## Initialize cell label - initialLabel <- .decontxInitializeZ(object= counts) - globalZ <- initialLabel$globalZ - cbZ <- initialLabel$cbZ - trZ <- initialLabel$trZ - - ## Initialization - deltaInit <- delta - theta <- - stats::rbeta(n = nC, - shape1 = deltaInit, - shape2 = deltaInit) - estRmat <- t(t(counts) * theta) - - phi <- .colSumByGroupNumeric(estRmat, cbZ, max(cbZ)) - eta <- - rowSums(phi) - .colSumByGroupNumeric(phi, trZ, max(trZ)) - phi <- - normalizeCounts(phi, - normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- - normalizeCounts(eta, - normalize = "proportion", - pseudocountNormalize = 1e-20) - - ll <- c() - - llRound <- .bgCalcLL( - counts = counts, - globalZ = globalZ, - cbZ = cbZ, - phi = phi, - eta = eta, - theta = theta - ) - - ## EM updates - while (iter <= maxIter & - numIterWithoutImprovement <= stopIter) { - nextDecon <- .cDCalcEMbgDecontamination( - counts = counts, - globalZ = globalZ, - cbZ = cbZ, - trZ = trZ, - phi = phi, - eta = eta, - theta = theta - ) - - theta <- nextDecon$theta - phi <- nextDecon$phi - eta <- nextDecon$eta - delta <- nextDecon$delta - - ## Calculate log-likelihood - llTemp <- - .bgCalcLL( - counts = counts, - globalZ = globalZ, - cbZ = cbZ, - phi = phi, - eta = eta, - theta = theta - ) - ll <- c(ll, llTemp) - llRound <- c(llRound, round(llTemp, 2)) - - if (round(llTemp, 2) > llRound[iter] | iter == 1) { - numIterWithoutImprovement <- 1L - } else { - numIterWithoutImprovement <- numIterWithoutImprovement + 1L - } - iter <- iter + 1L - } - } resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) @@ -699,10 +623,6 @@ decontX <- function(counts, "theta" = theta, "delta" = delta ) - # if( deconMethod=="clustering" ) { - # posterior.params = list( "est.GeneDist"=phi, "est.ConDist"=eta ) - # resList = append( resList , posterior.params ) - # } return(list( "runParams" = runParams, @@ -780,27 +700,29 @@ addLogLikelihood <- function(llA, llB) { sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts= object)) } - # add the log2 normalized counts into sce object + ## Add the log2 normalized counts into sce object + ## The normalized counts is also centered using library size in the original count matrix + ## in scater::normalizeSCE() sce <- suppressWarnings(scater::normalizeSCE(sce)) if (nrow(sce) <= varGenes) { topVariableGenes <- 1:nrow(sce) } else if( nrow(sce) > varGenes ) { - # Use the top most variable genes to do rough clustering (celda_CG & Louvian graph algorithm) - mvTrend <- scran::trendVar(sce, use.spikes=FALSE) - decomposeTrend <- scran::decomposeVar(sce, mvTrend) - topVariableGenes <- order(decomposeTrend$bio, decreasing=TRUE)[1:varGenes] + ## Use the top most variable genes to do rough clustering (celda_CG & Louvian graph algorithm) + mvTrend <- scran::trendVar(sce, use.spikes=FALSE) + decomposeTrend <- scran::decomposeVar(sce, mvTrend) + topVariableGenes <- order(decomposeTrend$bio, decreasing=TRUE)[1:varGenes] } countsFiltered <- as.matrix(SingleCellExperiment::counts(sce[topVariableGenes, ])) storage.mode(countsFiltered) <- "integer" - # Celda clustering using recursive module splitting + ## Celda clustering using recursive module splitting L = min(L, nrow(countsFiltered)) initial.module.split <- recursiveSplitModule(countsFiltered, initialL=L, maxL=L, perplexity=FALSE, verbose=FALSE) initial.modules.model <- subsetCeldaList(initial.module.split, list(L=L)) - # Louvian community detection + ## Louvian community detection fm <- factorizeMatrix(countsFiltered, initial.modules.model, type="counts") resUmap <- uwot::umap(t(sqrt(fm$counts$cell)), n_neighbors=15, min_dist = 0.01, spread = 1) From 1358122dfa0ca1ce3a49580229bf3487817d5550 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sat, 7 Dec 2019 15:57:04 -0500 Subject: [PATCH 087/149] cleaning code in decontx --- R/decon.R | 61 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/R/decon.R b/R/decon.R index 5af9c244..cfdc85af 100644 --- a/R/decon.R +++ b/R/decon.R @@ -266,8 +266,17 @@ simulateContaminatedMatrix <- function(C = 300, #' @param verbose Logical. Whether to print log messages. Default TRUE. #' @param varGenes Positive Integer. Used only when z is not provided. #' Need to be larger than 1. Default value is 5000 if not provided. +#' varGenes, being the number of most variable genes, is used to filter genes +#' based on the variability of gene's expression cross cells. While the variability +#' is calcualted using scran::trendVar() and scran::decomposeVar(). +#' @param L Positive Integer. Used only when z is not provided. +#' Need to be larger than 1. Default value is 50 if not provided. +#' L, being the number of gene modules, is used on celda_CG clustering +#' to collapse genes into gene modules. #' @param dbscanEps Numeric. Used only when z is not provided. #' Need to be non-negative. Default is 1.0 if not provided. +#' dbscanEps is the clustering resolution parameter that is used to feed into +#' dbscan::dbscan() to estimate broad cell clusters. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. @@ -292,6 +301,7 @@ decontX <- function(counts, logfile = NULL, verbose = TRUE, varGenes = NULL, + L = NULL, dbscanEps = NULL, seed = 12345) { @@ -304,6 +314,7 @@ decontX <- function(counts, logfile = logfile, verbose = verbose, varGenes = varGenes, + L = L, dbscanEps = dbscanEps) } else { with_seed(seed, @@ -315,6 +326,7 @@ decontX <- function(counts, logfile = logfile, verbose = verbose, varGenes = varGenes, + L = L, dbscanEps = dbscanEps)) } @@ -330,7 +342,9 @@ decontX <- function(counts, logfile = NULL, verbose = TRUE, varGenes = NULL, - dbscanEps = NULL) { + dbscanEps = NULL, + L = NULL) { + ## Empty expression genes won't be used for estimation haveEmptyGenes <- FALSE totalGenes <- nrow(counts) @@ -341,6 +355,9 @@ decontX <- function(counts, haveEmptyGenes <- TRUE } + nC = ncol(counts) + allCellNames = colnames(counts) + if (!is.null(batch)) { ## Set result lists upfront for all cells from different batches logLikelihood <- c() @@ -348,19 +365,19 @@ decontX <- function(counts, 0, ncol = ncol(counts), nrow = totalGenes, - dimnames = list(geneNames, colnames(counts)) + dimnames = list(geneNames, allCellNames) ) - theta <- rep(NA, ncol(counts)) - estConp <- rep(NA, ncol(counts)) + theta <- rep(NA, nC) + estConp <- rep(NA, nC) + returnZ <- rep(NA, nC) batchIndex <- unique(batch) for (bat in batchIndex) { + zBat <- NULL countsBat <- counts[, batch == bat] if (!is.null(z)) { zBat <- z[batch == bat] - } else { - zBat <- z } resBat <- .decontXoneBatch( counts = countsBat, @@ -371,7 +388,8 @@ decontX <- function(counts, logfile = logfile, verbose = verbose, varGenes = varGenes, - dbscanEps = dbscanEps + dbscanEps = dbscanEps, + L = L ) if (haveEmptyGenes) { @@ -383,6 +401,7 @@ decontX <- function(counts, } estConp[batch == bat] <- resBat$resList$estConp theta[batch == bat] <- resBat$resList$theta + returnZ[batch == bat] <- resBat$runParams$z if (is.null(logLikelihood)) { logLikelihood <- resBat$resList$logLikelihood @@ -393,6 +412,9 @@ decontX <- function(counts, } runParams <- resBat$runParams + ## All batches share the same other parameters except cluster label z + ## So update z in the final returned result + runParams$z <- returnZ method <- resBat$method resList <- list( "logLikelihood" = logLikelihood, @@ -417,11 +439,12 @@ decontX <- function(counts, logfile = logfile, verbose = verbose, varGenes = varGenes, - dbscanEps = dbscanEps + dbscanEps = dbscanEps, + L = L ) if (haveEmptyGenes) { - resBat <- matrix(0, nrow = totalGenes, ncol = ncol(counts), - dimnames = list(geneNames, colnames(counts))) + resBat <- matrix(0, nrow = totalGenes, ncol = nC, + dimnames = list(geneNames, allCellNames)) resBat[noneEmptyGeneIndex, ] <- resultsOneBatch$resList$estNativeCounts resultsOneBatch$resList$estNativeCounts <- resBat } @@ -439,7 +462,8 @@ decontX <- function(counts, logfile = NULL, verbose = TRUE, varGenes = NULL, - dbscanEps = NULL) { + dbscanEps = NULL, + L = NULL) { .checkCountsDecon(counts) .checkParametersDecon(proportionPrior = delta) @@ -471,6 +495,7 @@ decontX <- function(counts, varGenes = .processvarGenes(varGenes) dbscanEps = .processdbscanEps(dbscanEps) + L = .processL(L) z <- .decontxInitializeZ(object = counts, varGenes = varGenes, @@ -694,7 +719,7 @@ addLogLikelihood <- function(llA, llB) { function(object, # object is either a sce object or a count matrix varGenes = 5000, L = 50, - dbscanEps = 1) { + dbscanEps = 1.0) { if (!is(object, "SingleCellExperiment")) { sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts= object)) @@ -761,3 +786,15 @@ addLogLikelihood <- function(llA, llB) { } return(dbscanEps) } + +## process gene modules L +.processL = function(L) { + if (is.null(L)) { + L = 50 + } else { + if (L < 2 | !is.integer(L)) { + stop("Parameter 'L' must be an integer and larger than 1.") + } + } + return(L) +} From 4d864fd440210f0f1bc502d750c27110a78c1954 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sat, 7 Dec 2019 19:52:04 -0500 Subject: [PATCH 088/149] update logMessages in DecontX --- R/decon.R | 205 ++++++++++++++++++++++++++++++++++--------------- man/decontX.Rd | 15 +++- 2 files changed, 155 insertions(+), 65 deletions(-) diff --git a/R/decon.R b/R/decon.R index cfdc85af..bba4d4fa 100644 --- a/R/decon.R +++ b/R/decon.R @@ -358,6 +358,18 @@ decontX <- function(counts, nC = ncol(counts) allCellNames = colnames(counts) + .logMessages( + paste(rep("-", 50), collapse = ""), + "\n", + "Starting DecontX. Decontamination", + "\n", + paste(rep("-", 50), collapse = ""), + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + if (!is.null(batch)) { ## Set result lists upfront for all cells from different batches logLikelihood <- c() @@ -374,6 +386,21 @@ decontX <- function(counts, batchIndex <- unique(batch) for (bat in batchIndex) { + .logMessages( + paste(rep(" ", 4), collapse = ""), + paste(rep("-", 50), collapse = ""), + "\n", + paste(rep(" ", 4), collapse = ""), + "Estimate contamination within batch", + "\n", + paste(rep(" ", 4), collapse = ""), + paste(rep("-", 50), collapse = ""), + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + zBat <- NULL countsBat <- counts[, batch == bat] if (!is.null(z)) { @@ -448,6 +475,24 @@ decontX <- function(counts, resBat[noneEmptyGeneIndex, ] <- resultsOneBatch$resList$estNativeCounts resultsOneBatch$resList$estNativeCounts <- resBat } + + zMessage = "" + if (is.null(z)) { + zMessage = "\nEstimated cell clusters z is saved in the result as well." + } + .logMessages( + paste(rep("-", 50), collapse = ""), + "\n", + "All is done", + zMessage, + "\n", + paste(rep("-", 50), collapse = ""), + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + return(resultsOneBatch) } @@ -472,19 +517,18 @@ decontX <- function(counts, if (is.null(z)) { .logMessages( + paste(rep(" ", 8), collapse = ""), paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose - ) - .logMessages( - "Clustering using graph and density-based method to find cell clusters", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - .logMessages( + "\n", + paste(rep(" ", 8), collapse = ""), + "Start to estimate broad cell types", + "\n", + paste(rep(" ", 8), collapse = ""), + "which will then be used for DecontX contamination estimation.", + "\n", + paste(rep(" ", 8), collapse = ""), paste(rep("-", 50), collapse = ""), + sep = "", logfile = logfile, append = TRUE, verbose = verbose @@ -499,7 +543,9 @@ decontX <- function(counts, z <- .decontxInitializeZ(object = counts, varGenes = varGenes, - dbscanEps = dbscanEps) + dbscanEps = dbscanEps, + verbose = verbose, + logfile = logfile) } z <- .processCellLabels(z, numCells = nC) @@ -510,30 +556,15 @@ decontX <- function(counts, stopIter <- 3L .logMessages( + paste(rep(" ", 8), collapse = ""), paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose - ) - .logMessages( - "Start DecontX. Decontamination", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - - if (!is.null(batch)) { - .logMessages( - "batch: ", - batch, - logfile = logfile, - append = TRUE, - verbose = verbose - ) - } - - .logMessages( + "\n", + paste(rep(" ", 8), collapse = ""), + "Estimate contamination", + "\n", + paste(rep(" ", 8), collapse = ""), paste(rep("-", 50), collapse = ""), + sep = "", logfile = logfile, append = TRUE, verbose = verbose @@ -608,30 +639,26 @@ decontX <- function(counts, resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) endTime <- Sys.time() - .logMessages( - paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose - ) - .logMessages( - "Completed DecontX. Total time:", - format(difftime(endTime, startTime)), - logfile = logfile, - append = TRUE, - verbose = verbose - ) if (!is.null(batch)) { - .logMessages( - "batch: ", - batch, - logfile = logfile, - append = TRUE, - verbose = verbose - ) + batchMessage = paste(" ", "in batch ", batch, ".", sep = "") + } else { + batchMessage = "." } .logMessages( + paste(rep(" ", 8), collapse = ""), + paste(rep("-", 50), collapse = ""), + "\n", + paste(rep(" ", 8), collapse = ""), + "Contamination estimation is completed", + batchMessage, + "\n", + paste(rep(" ", 8), collapse = ""), + "DecontX time: ", + format(difftime(endTime, startTime)), + "\n", + paste(rep(" ", 8), collapse = ""), paste(rep("-", 50), collapse = ""), + sep = "", logfile = logfile, append = TRUE, verbose = verbose @@ -719,7 +746,9 @@ addLogLikelihood <- function(llA, llB) { function(object, # object is either a sce object or a count matrix varGenes = 5000, L = 50, - dbscanEps = 1.0) { + dbscanEps = 1.0, + verbose = TRUE, + logfile = NULL) { if (!is(object, "SingleCellExperiment")) { sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts= object)) @@ -741,16 +770,64 @@ addLogLikelihood <- function(llA, llB) { countsFiltered <- as.matrix(SingleCellExperiment::counts(sce[topVariableGenes, ])) storage.mode(countsFiltered) <- "integer" + .logMessages( + paste(rep(" ", 12), collapse = ""), + paste(rep("-", 50), collapse = ""), + "\n", + paste(rep(" ", 12), collapse = ""), + "Collapse genes into ", + L, + " gene modules", + "\n", + paste(rep(" ", 12), collapse = ""), + paste(rep("-", 50), collapse = ""), + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) ## Celda clustering using recursive module splitting - L = min(L, nrow(countsFiltered)) - initial.module.split <- recursiveSplitModule(countsFiltered, initialL=L, maxL=L, perplexity=FALSE, verbose=FALSE) - initial.modules.model <- subsetCeldaList(initial.module.split, list(L=L)) - - - ## Louvian community detection - fm <- factorizeMatrix(countsFiltered, initial.modules.model, type="counts") - resUmap <- uwot::umap(t(sqrt(fm$counts$cell)), n_neighbors=15, min_dist = 0.01, spread = 1) + if (L < nrow(countsFiltered)) { + initial.module.split <- recursiveSplitModule(countsFiltered, initialL=L, maxL=L, perplexity=FALSE, verbose=FALSE) + initial.modules.model <- subsetCeldaList(initial.module.split, list(L=L)) + fm <- factorizeMatrix(countsFiltered, initial.modules.model, type="counts")$counts$cell + } else { + fm = countsFiltered + } + + .logMessages( + paste(rep(" ", 12), collapse = ""), + paste(rep("-", 50), collapse = ""), + "\n", + paste(rep(" ", 12), collapse = ""), + "Use umap to reduce features into 2 dimensions for cell community detection", + "\n", + paste(rep(" ", 12), collapse = ""), + paste(rep("-", 50), collapse = ""), + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + ## Umap to reduce dimension into 2 cluster + nNeighbors = min(15, ncol(countsFiltered)) + resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors=nNeighbors, min_dist = 0.01, spread = 1) + rm(fm) + .logMessages( + paste(rep(" ", 12), collapse = ""), + paste(rep("-", 50), collapse = ""), + "\n", + paste(rep(" ", 12), collapse = ""), + "Use density-based model DBSCAN to detect cell community", + "\n", + paste(rep(" ", 12), collapse = ""), + paste(rep("-", 50), collapse = ""), + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) # Use dbSCAN on the UMAP to identify broad cell types totalClusters <- 1 while(totalClusters <= 1 & dbscanEps > 0) { @@ -798,3 +875,5 @@ addLogLikelihood <- function(llA, llB) { } return(L) } + + diff --git a/man/decontX.Rd b/man/decontX.Rd index 145f06c6..7217fd4f 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -13,6 +13,7 @@ decontX( logfile = NULL, verbose = TRUE, varGenes = NULL, + L = NULL, dbscanEps = NULL, seed = 12345 ) @@ -37,10 +38,20 @@ to be 10.} \item{verbose}{Logical. Whether to print log messages. Default TRUE.} \item{varGenes}{Positive Integer. Used only when z is not provided. -Need to be larger than 1. Default value is 5000 if not provided.} +Need to be larger than 1. Default value is 5000 if not provided. +varGenes, being the number of most variable genes, is used to filter genes +based on the variability of gene's expression cross cells. While the variability +is calcualted using scran::trendVar() and scran::decomposeVar().} + +\item{L}{Positive Integer. Used only when z is not provided. +Need to be larger than 1. Default value is 50 if not provided. +L, being the number of gene modules, is used on celda_CG clustering +to collapse genes into gene modules.} \item{dbscanEps}{Numeric. Used only when z is not provided. -Need to be non-negative. Default is 1.0 if not provided.} +Need to be non-negative. Default is 1.0 if not provided. +dbscanEps is the clustering resolution parameter that is used to feed into +dbscan::dbscan() to estimate broad cell clusters.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to From 7a7831f2347b204b0865186eb1dc5b95093439ba Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sat, 7 Dec 2019 20:04:54 -0500 Subject: [PATCH 089/149] bug fixed --- R/decon.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/decon.R b/R/decon.R index bba4d4fa..f66eae44 100644 --- a/R/decon.R +++ b/R/decon.R @@ -514,6 +514,7 @@ decontX <- function(counts, # nG <- nrow(counts) nC <- ncol(counts) + deconMethod <- "clustering" if (is.null(z)) { .logMessages( @@ -535,7 +536,6 @@ decontX <- function(counts, ) ## Always uses clusters for DecontX estimation #deconMethod <- "background" - deconMethod <- "clustering" varGenes = .processvarGenes(varGenes) dbscanEps = .processdbscanEps(dbscanEps) @@ -678,8 +678,7 @@ decontX <- function(counts, return(list( "runParams" = runParams, - "resList" = resList, - "method" = deconMethod + "resList" = resList )) } From 525283118ea24c9a265898c743d013fcc43057f1 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sun, 8 Dec 2019 04:15:27 -0500 Subject: [PATCH 090/149] logMessage for decontX --- R/decon.R | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index f66eae44..a1b2b9e0 100644 --- a/R/decon.R +++ b/R/decon.R @@ -391,7 +391,8 @@ decontX <- function(counts, paste(rep("-", 50), collapse = ""), "\n", paste(rep(" ", 4), collapse = ""), - "Estimate contamination within batch", + "Estimate contamination within batch ", + bat, "\n", paste(rep(" ", 4), collapse = ""), paste(rep("-", 50), collapse = ""), @@ -450,6 +451,19 @@ decontX <- function(counts, "theta" = theta ) + .logMessages( + paste(rep("-", 50), collapse = ""), + "\n", + "All is done", + zMessage, + "\n", + paste(rep("-", 50), collapse = ""), + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + return(list( "runParams" = runParams, "resList" = resList, From 9c47f2c9805cf8fc7d0898bacf9f6c347cb083de Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sun, 8 Dec 2019 04:31:14 -0500 Subject: [PATCH 091/149] bug fixed for decontx --- R/decon.R | 56 +++++++++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/R/decon.R b/R/decon.R index a1b2b9e0..d2278f9c 100644 --- a/R/decon.R +++ b/R/decon.R @@ -451,43 +451,29 @@ decontX <- function(counts, "theta" = theta ) - .logMessages( - paste(rep("-", 50), collapse = ""), - "\n", - "All is done", - zMessage, - "\n", - paste(rep("-", 50), collapse = ""), - sep = "", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - - return(list( + returnResult <- list( "runParams" = runParams, "resList" = resList, "method" = method - )) - } - - ## When there is only one batch - resultsOneBatch <- .decontXoneBatch( - counts = counts, - z = z, - maxIter = maxIter, - delta = delta, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - dbscanEps = dbscanEps, - L = L - ) - if (haveEmptyGenes) { - resBat <- matrix(0, nrow = totalGenes, ncol = nC, - dimnames = list(geneNames, allCellNames)) - resBat[noneEmptyGeneIndex, ] <- resultsOneBatch$resList$estNativeCounts - resultsOneBatch$resList$estNativeCounts <- resBat + ) + } else { ## When there is only one batch + returnResult <- .decontXoneBatch( + counts = counts, + z = z, + maxIter = maxIter, + delta = delta, + logfile = logfile, + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps, + L = L + ) + if (haveEmptyGenes) { + resBat <- matrix(0, nrow = totalGenes, ncol = nC, + dimnames = list(geneNames, allCellNames)) + resBat[noneEmptyGeneIndex, ] <- returnResult$resList$estNativeCounts + returnResult$resList$estNativeCounts <- resBat + } } zMessage = "" @@ -507,7 +493,7 @@ decontX <- function(counts, verbose = verbose ) - return(resultsOneBatch) + return(returnResult) } From b5a08d8be0e2750a566be6b67c5712e613aa5f8e Mon Sep 17 00:00:00 2001 From: Irisapo Date: Mon, 9 Dec 2019 13:41:21 -0500 Subject: [PATCH 092/149] scater::normalizeSCE --> scater::logNormCounts --- R/decon.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/decon.R b/R/decon.R index d2278f9c..e044983a 100644 --- a/R/decon.R +++ b/R/decon.R @@ -756,7 +756,8 @@ addLogLikelihood <- function(llA, llB) { ## Add the log2 normalized counts into sce object ## The normalized counts is also centered using library size in the original count matrix ## in scater::normalizeSCE() - sce <- suppressWarnings(scater::normalizeSCE(sce)) + #sce <- suppressWarnings(scater::normalizeSCE(sce)) + sce <- scater::logNormCounts(sce, log=TRUE) if (nrow(sce) <= varGenes) { topVariableGenes <- 1:nrow(sce) @@ -808,7 +809,7 @@ addLogLikelihood <- function(llA, llB) { append = TRUE, verbose = verbose ) - ## Umap to reduce dimension into 2 cluster + ## Louvan graph-based method to reduce dimension into 2 cluster nNeighbors = min(15, ncol(countsFiltered)) resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors=nNeighbors, min_dist = 0.01, spread = 1) rm(fm) From df4ebc589daf6e8b19e8d9279c6456ffb674f36a Mon Sep 17 00:00:00 2001 From: Irisapo Date: Mon, 9 Dec 2019 18:42:47 -0500 Subject: [PATCH 093/149] formaaaaaaaaaaaat --- R/decon.R | 104 +++++++++++++++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/R/decon.R b/R/decon.R index e044983a..785f3227 100644 --- a/R/decon.R +++ b/R/decon.R @@ -276,7 +276,7 @@ simulateContaminatedMatrix <- function(C = 300, #' @param dbscanEps Numeric. Used only when z is not provided. #' Need to be non-negative. Default is 1.0 if not provided. #' dbscanEps is the clustering resolution parameter that is used to feed into -#' dbscan::dbscan() to estimate broad cell clusters. +#' dbscan::dbscan() to estimate broad cell clusters. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. @@ -313,9 +313,9 @@ decontX <- function(counts, delta = delta, logfile = logfile, verbose = verbose, - varGenes = varGenes, + varGenes = varGenes, L = L, - dbscanEps = dbscanEps) + dbscanEps = dbscanEps) } else { with_seed(seed, res <- .decontX(counts = counts, @@ -325,9 +325,9 @@ decontX <- function(counts, delta = delta, logfile = logfile, verbose = verbose, - varGenes = varGenes, + varGenes = varGenes, L = L, - dbscanEps = dbscanEps)) + dbscanEps = dbscanEps)) } return(res) @@ -360,11 +360,11 @@ decontX <- function(counts, .logMessages( paste(rep("-", 50), collapse = ""), - "\n", + "\n", "Starting DecontX. Decontamination", - "\n", + "\n", paste(rep("-", 50), collapse = ""), - sep = "", + sep = "", logfile = logfile, append = TRUE, verbose = verbose @@ -381,21 +381,21 @@ decontX <- function(counts, ) theta <- rep(NA, nC) estConp <- rep(NA, nC) - returnZ <- rep(NA, nC) + returnZ <- rep(NA, nC) batchIndex <- unique(batch) for (bat in batchIndex) { .logMessages( paste(rep(" ", 4), collapse = ""), - paste(rep("-", 50), collapse = ""), + paste(rep("-", 50), collapse = ""), "\n", paste(rep(" ", 4), collapse = ""), "Estimate contamination within batch ", bat, "\n", paste(rep(" ", 4), collapse = ""), - paste(rep("-", 50), collapse = ""), + paste(rep("-", 50), collapse = ""), sep = "", logfile = logfile, append = TRUE, @@ -415,8 +415,8 @@ decontX <- function(counts, delta = delta, logfile = logfile, verbose = verbose, - varGenes = varGenes, - dbscanEps = dbscanEps, + varGenes = varGenes, + dbscanEps = dbscanEps, L = L ) @@ -440,9 +440,9 @@ decontX <- function(counts, } runParams <- resBat$runParams - ## All batches share the same other parameters except cluster label z - ## So update z in the final returned result - runParams$z <- returnZ + ## All batches share the same other parameters except cluster label z + ## So update z in the final returned result + runParams$z <- returnZ method <- resBat$method resList <- list( "logLikelihood" = logLikelihood, @@ -475,19 +475,19 @@ decontX <- function(counts, returnResult$resList$estNativeCounts <- resBat } } - - zMessage = "" + + zMessage <- "" if (is.null(z)) { - zMessage = "\nEstimated cell clusters z is saved in the result as well." + zMessage <- "\nEstimated cell clusters z is saved in the result as well." } .logMessages( paste(rep("-", 50), collapse = ""), - "\n", + "\n", "All is done", zMessage, - "\n", + "\n", paste(rep("-", 50), collapse = ""), - sep = "", + sep = "", logfile = logfile, append = TRUE, verbose = verbose @@ -537,9 +537,9 @@ decontX <- function(counts, ## Always uses clusters for DecontX estimation #deconMethod <- "background" - varGenes = .processvarGenes(varGenes) - dbscanEps = .processdbscanEps(dbscanEps) - L = .processL(L) + varGenes <- .processvarGenes(varGenes) + dbscanEps <- .processdbscanEps(dbscanEps) + L <- .processL(L) z <- .decontxInitializeZ(object = counts, varGenes = varGenes, @@ -640,9 +640,9 @@ decontX <- function(counts, endTime <- Sys.time() if (!is.null(batch)) { - batchMessage = paste(" ", "in batch ", batch, ".", sep = "") + batchMessage <- paste(" ", "in batch ", batch, ".", sep = "") } else { - batchMessage = "." + batchMessage <- "." } .logMessages( paste(rep(" ", 8), collapse = ""), @@ -658,7 +658,7 @@ decontX <- function(counts, "\n", paste(rep(" ", 8), collapse = ""), paste(rep("-", 50), collapse = ""), - sep = "", + sep = "", logfile = logfile, append = TRUE, verbose = verbose @@ -666,7 +666,7 @@ decontX <- function(counts, runParams <- list("deltaInit" = deltaInit, "iteration" = iter - 1L, - "z" = z) + "z" = z) resList <- list( "logLikelihood" = ll, @@ -745,27 +745,27 @@ addLogLikelihood <- function(llA, llB) { function(object, # object is either a sce object or a count matrix varGenes = 5000, L = 50, - dbscanEps = 1.0, + dbscanEps = 1.0, verbose = TRUE, logfile = NULL) { if (!is(object, "SingleCellExperiment")) { - sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts= object)) + sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = object)) } - ## Add the log2 normalized counts into sce object + ## Add the log2 normalized counts into sce object ## The normalized counts is also centered using library size in the original count matrix ## in scater::normalizeSCE() #sce <- suppressWarnings(scater::normalizeSCE(sce)) - sce <- scater::logNormCounts(sce, log=TRUE) + sce <- scater::logNormCounts(sce, log = TRUE) if (nrow(sce) <= varGenes) { topVariableGenes <- 1:nrow(sce) - } else if( nrow(sce) > varGenes ) { + } else if (nrow(sce) > varGenes) { ## Use the top most variable genes to do rough clustering (celda_CG & Louvian graph algorithm) - mvTrend <- scran::trendVar(sce, use.spikes=FALSE) + mvTrend <- scran::trendVar(sce, use.spikes = FALSE) decomposeTrend <- scran::decomposeVar(sce, mvTrend) - topVariableGenes <- order(decomposeTrend$bio, decreasing=TRUE)[1:varGenes] + topVariableGenes <- order(decomposeTrend$bio, decreasing = TRUE)[1:varGenes] } countsFiltered <- as.matrix(SingleCellExperiment::counts(sce[topVariableGenes, ])) storage.mode(countsFiltered) <- "integer" @@ -788,13 +788,13 @@ addLogLikelihood <- function(llA, llB) { ) ## Celda clustering using recursive module splitting if (L < nrow(countsFiltered)) { - initial.module.split <- recursiveSplitModule(countsFiltered, initialL=L, maxL=L, perplexity=FALSE, verbose=FALSE) - initial.modules.model <- subsetCeldaList(initial.module.split, list(L=L)) - fm <- factorizeMatrix(countsFiltered, initial.modules.model, type="counts")$counts$cell + initial.module.split <- recursiveSplitModule(countsFiltered, initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + initial.modules.model <- subsetCeldaList(initial.module.split, list(L = L)) + fm <- factorizeMatrix(countsFiltered, initial.modules.model, type = "counts")$counts$cell } else { - fm = countsFiltered + fm <- countsFiltered } - + .logMessages( paste(rep(" ", 12), collapse = ""), paste(rep("-", 50), collapse = ""), @@ -810,9 +810,9 @@ addLogLikelihood <- function(llA, llB) { verbose = verbose ) ## Louvan graph-based method to reduce dimension into 2 cluster - nNeighbors = min(15, ncol(countsFiltered)) - resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors=nNeighbors, min_dist = 0.01, spread = 1) - rm(fm) + nNeighbors <- min(15, ncol(countsFiltered)) + resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors, min_dist = 0.01, spread = 1) + rm(fm) .logMessages( paste(rep(" ", 12), collapse = ""), @@ -830,7 +830,7 @@ addLogLikelihood <- function(llA, llB) { ) # Use dbSCAN on the UMAP to identify broad cell types totalClusters <- 1 - while(totalClusters <= 1 & dbscanEps > 0) { + while (totalClusters <= 1 & dbscanEps > 0) { resDbscan <- dbscan::dbscan(resUmap, dbscanEps) dbscanEps <- dbscanEps - (0.25 * dbscanEps) totalClusters <- length(unique(resDbscan$cluster)) @@ -841,33 +841,33 @@ addLogLikelihood <- function(llA, llB) { ## process varGenes -.processvarGenes = function(varGenes) { +.processvarGenes <- function(varGenes) { if (is.null(varGenes)) { varGenes <- 5000 } else { if (varGenes < 2 | !is.integer(varGenes)) { - stop("Parameter 'varGenes' must be an integer and larger than 1.") - } + stop("Parameter 'varGenes' must be an integer and larger than 1.") + } } return(varGenes) } ## process dbscanEps for resolusion threshold using DBSCAN -.processdbscanEps = function(dbscanEps) { +.processdbscanEps <- function(dbscanEps) { if (is.null(dbscanEps)) { dbscanEps <- 1 } else { if (dbscanEps < 0) { stop("Parameter 'dbscanEps' needs to be non-negative.") - } + } } return(dbscanEps) } ## process gene modules L -.processL = function(L) { +.processL <- function(L) { if (is.null(L)) { - L = 50 + L <- 50 } else { if (L < 2 | !is.integer(L)) { stop("Parameter 'L' must be an integer and larger than 1.") From 7a879764a58b97dc7844e942cba4ed788b655753 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Tue, 10 Dec 2019 10:13:09 -0500 Subject: [PATCH 094/149] formaaaaaaaat --- R/decon.R | 39 ++++++++++++++++++++++--------------- man/decontX.Rd | 4 ++-- tests/testthat/test-decon.R | 5 ----- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/R/decon.R b/R/decon.R index 785f3227..935fcc07 100644 --- a/R/decon.R +++ b/R/decon.R @@ -208,7 +208,8 @@ simulateContaminatedMatrix <- function(C = 300, )) } -# DEPRECATED. This is not used, but is kept as it might be useful in the feature. +# DEPRECATED. This is not used, but is kept as it might be useful in the +# feature. # This function updates decontamination using background distribution .cDCalcEMbgDecontamination <- function(counts, globalZ, cbZ, trZ, phi, eta, theta) { @@ -267,8 +268,8 @@ simulateContaminatedMatrix <- function(C = 300, #' @param varGenes Positive Integer. Used only when z is not provided. #' Need to be larger than 1. Default value is 5000 if not provided. #' varGenes, being the number of most variable genes, is used to filter genes -#' based on the variability of gene's expression cross cells. While the variability -#' is calcualted using scran::trendVar() and scran::decomposeVar(). +#' based on the variability of gene's expression cross cells. While the +#' variability is calcualted using scran::trendVar() and scran::decomposeVar(). #' @param L Positive Integer. Used only when z is not provided. #' Need to be larger than 1. Default value is 50 if not provided. #' L, being the number of gene modules, is used on celda_CG clustering @@ -355,7 +356,7 @@ decontX <- function(counts, haveEmptyGenes <- TRUE } - nC = ncol(counts) + nC <- ncol(counts) allCellNames = colnames(counts) .logMessages( @@ -392,7 +393,7 @@ decontX <- function(counts, "\n", paste(rep(" ", 4), collapse = ""), "Estimate contamination within batch ", - bat, + bat, "\n", paste(rep(" ", 4), collapse = ""), paste(rep("-", 50), collapse = ""), @@ -475,7 +476,7 @@ decontX <- function(counts, returnResult$resList$estNativeCounts <- resBat } } - + zMessage <- "" if (is.null(z)) { zMessage <- "\nEstimated cell clusters z is saved in the result as well." @@ -754,17 +755,18 @@ addLogLikelihood <- function(llA, llB) { } ## Add the log2 normalized counts into sce object - ## The normalized counts is also centered using library size in the original count matrix - ## in scater::normalizeSCE() + ## The normalized counts is also centered using library size in the + ## original count matrix in scater::normalizeSCE() #sce <- suppressWarnings(scater::normalizeSCE(sce)) sce <- scater::logNormCounts(sce, log = TRUE) if (nrow(sce) <= varGenes) { - topVariableGenes <- 1:nrow(sce) - } else if (nrow(sce) > varGenes) { - ## Use the top most variable genes to do rough clustering (celda_CG & Louvian graph algorithm) + topVariableGenes <- seq_len(nrow(sce)) + } else if (nrow(sce) > varGenes) { + ## Use the top most variable genes to do rough clustering + ## (celda_CG & Louvian graph algorithm) mvTrend <- scran::trendVar(sce, use.spikes = FALSE) - decomposeTrend <- scran::decomposeVar(sce, mvTrend) + decomposeTrend <- scran::decomposeVar(sce, mvTrend) topVariableGenes <- order(decomposeTrend$bio, decreasing = TRUE)[1:varGenes] } countsFiltered <- as.matrix(SingleCellExperiment::counts(sce[topVariableGenes, ])) @@ -788,9 +790,13 @@ addLogLikelihood <- function(llA, llB) { ) ## Celda clustering using recursive module splitting if (L < nrow(countsFiltered)) { - initial.module.split <- recursiveSplitModule(countsFiltered, initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) - initial.modules.model <- subsetCeldaList(initial.module.split, list(L = L)) - fm <- factorizeMatrix(countsFiltered, initial.modules.model, type = "counts")$counts$cell + initialModuleSplit <- recursiveSplitModule(countsFiltered, + initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) + fm <- factorizeMatrix(countsFiltered, initialModel, type = "counts") + fm <- fm$counts$cell + rm(initialModuleSplit) + rm(initialModel) } else { fm <- countsFiltered } @@ -811,7 +817,8 @@ addLogLikelihood <- function(llA, llB) { ) ## Louvan graph-based method to reduce dimension into 2 cluster nNeighbors <- min(15, ncol(countsFiltered)) - resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors, min_dist = 0.01, spread = 1) + resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors, + min_dist = 0.01, spread = 1) rm(fm) .logMessages( diff --git a/man/decontX.Rd b/man/decontX.Rd index 7217fd4f..48e3c244 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -40,8 +40,8 @@ to be 10.} \item{varGenes}{Positive Integer. Used only when z is not provided. Need to be larger than 1. Default value is 5000 if not provided. varGenes, being the number of most variable genes, is used to filter genes -based on the variability of gene's expression cross cells. While the variability -is calcualted using scran::trendVar() and scran::decomposeVar().} +based on the variability of gene's expression cross cells. While the +variability is calcualted using scran::trendVar() and scran::decomposeVar().} \item{L}{Positive Integer. Used only when z is not provided. Need to be larger than 1. Default value is 50 if not provided. diff --git a/tests/testthat/test-decon.R b/tests/testthat/test-decon.R index 6daab64f..0ce35af2 100644 --- a/tests/testthat/test-decon.R +++ b/tests/testthat/test-decon.R @@ -35,11 +35,6 @@ test_that(desc = "Testing simulateContaminatedMatrix", { }) ## DecontX -#test_that(desc = "Testing DecontX", { -# expect_equal(ncol(deconSim$observedCounts) + ncol(deconSim2$observedCounts), -# ncol(batchDecontX$resList$estNativeCounts)) -# expect_equal(batchDecontXBg$method, "background") -#}) ## .decontXoneBatch test_that(desc = "Testing .decontXoneBatch", { From a62a91144edf72578105ffe24f079b5c7fb302c4 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Tue, 10 Dec 2019 10:40:31 -0500 Subject: [PATCH 095/149] format --- R/decon.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/decon.R b/R/decon.R index 935fcc07..bf47a441 100644 --- a/R/decon.R +++ b/R/decon.R @@ -357,7 +357,7 @@ decontX <- function(counts, } nC <- ncol(counts) - allCellNames = colnames(counts) + allCellNames <- colnames(counts) .logMessages( paste(rep("-", 50), collapse = ""), @@ -382,7 +382,7 @@ decontX <- function(counts, ) theta <- rep(NA, nC) estConp <- rep(NA, nC) - returnZ <- rep(NA, nC) + returnZ <- rep(NA, nC) batchIndex <- unique(batch) @@ -882,5 +882,3 @@ addLogLikelihood <- function(llA, llB) { } return(L) } - - From ff4630e977192cf705021a5d1c87a15820b870f5 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Tue, 10 Dec 2019 16:02:29 -0500 Subject: [PATCH 096/149] scater bump version 1.14.4 for decontx --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 476dc95a..8f11aa8b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,7 +46,7 @@ Imports: ggdendro, pROC, magrittr, - scater, + scater (>= 1.14.4), scran, SingleCellExperiment, dbscan From a59bbf0420969f160f5a30a1e5b2126d972899f1 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Tue, 10 Dec 2019 16:20:26 -0500 Subject: [PATCH 097/149] .colSumByGroup replaced by .colSumByGroupNumeric for integer issue decontx --- R/decon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index bf47a441..cb761ca0 100644 --- a/R/decon.R +++ b/R/decon.R @@ -105,7 +105,7 @@ simulateContaminatedMatrix <- function(C = 300, ## sample contamination count matrix nGByK <- - rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K) + rowSums(cellRmat) - .colSumByGroupNumeric(cellRmat, group = z, K = K) eta <- normalizeCounts(counts = nGByK, normalize = "proportion") cellCmat <- vapply(seq(C), function(i) { From 16f3fa0995cc62d70d781b445cf48f0d613c5f07 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Wed, 11 Dec 2019 15:08:47 -0500 Subject: [PATCH 098/149] simulateContaminatedMatrix bug fixed --- R/decon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index cb761ca0..bf47a441 100644 --- a/R/decon.R +++ b/R/decon.R @@ -105,7 +105,7 @@ simulateContaminatedMatrix <- function(C = 300, ## sample contamination count matrix nGByK <- - rowSums(cellRmat) - .colSumByGroupNumeric(cellRmat, group = z, K = K) + rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K) eta <- normalizeCounts(counts = nGByK, normalize = "proportion") cellCmat <- vapply(seq(C), function(i) { From f6aa6d6ddb181a84ed99abd893b07e7e0f4f75e9 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Thu, 12 Dec 2019 11:55:22 -0500 Subject: [PATCH 099/149] bump BiocManager version --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8f11aa8b..51bd4486 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -49,7 +49,8 @@ Imports: scater (>= 1.14.4), scran, SingleCellExperiment, - dbscan + dbscan, + BiocManager (>= 3.10) Suggests: testthat, knitr, @@ -60,7 +61,6 @@ Suggests: biomaRt, covr, M3DExampleData, - BiocManager, BiocStyle LinkingTo: Rcpp, RcppEigen License: MIT + file LICENSE From cce9b390a32b516610033863d3cf7393d2b45015 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Thu, 12 Dec 2019 12:20:55 -0500 Subject: [PATCH 100/149] bump BiocManager version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 51bd4486..2a8a8c32 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,7 +50,7 @@ Imports: scran, SingleCellExperiment, dbscan, - BiocManager (>= 3.10) + BiocManager (>= 1.30.10) Suggests: testthat, knitr, From 355405183f73f37a2b9028b4e27738b2b886beb3 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Thu, 12 Dec 2019 13:01:06 -0500 Subject: [PATCH 101/149] update DESCRIPTION, fix lints --- DESCRIPTION | 4 ++-- R/celda_C.R | 18 ++++++++++++------ R/decon.R | 19 ++++++++++++------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2a8a8c32..8f11aa8b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -49,8 +49,7 @@ Imports: scater (>= 1.14.4), scran, SingleCellExperiment, - dbscan, - BiocManager (>= 1.30.10) + dbscan Suggests: testthat, knitr, @@ -61,6 +60,7 @@ Suggests: biomaRt, covr, M3DExampleData, + BiocManager, BiocStyle LinkingTo: Rcpp, RcppEigen License: MIT + file LICENSE diff --git a/R/celda_C.R b/R/celda_C.R index b6f4bfbf..9cdf2c06 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -460,17 +460,23 @@ celda_C <- function(counts, for (j in seq_len(K)) { #otherIx <- seq(K)[-j] if (j != z[i]) { # when j is not current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - # if adding this cell -- Phi Numerator - lgamma(nCP[j] + nByC[i] + nG * beta) - # if adding this cell -- Phi Denominator - sum(lgamma(nGByCP[, j] + beta)) + # if without this cell -- Phi Numerator - lgamma(nCP[j] + nG * beta) # if without this cell -- Phi Denominator + ## Theta simplified + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + + # if adding this cell -- Phi Numerator + sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - + # if adding this cell -- Phi Denominator + lgamma(nCP[j] + nByC[i] + nG * beta) - + # if without this cell -- Phi Numerator + sum(lgamma(nGByCP[, j] + beta)) + + # if without this cell -- Phi Denominator + lgamma(nCP[j] + nG * beta) #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) #nGByCP2[j] - ## Phi Numerator (current cell) #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) #nCP2[j] - ## Phi Denominator (current cell) } else { # when j is current population assignment - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified + ## Theta simplified + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + sum(lgamma(nGByCP[, j] + beta)) - lgamma(nCP[j] + nG * beta) - sum(lgamma(nGByCP[, j] - counts[, i] + beta)) + diff --git a/R/decon.R b/R/decon.R index bf47a441..f95e8370 100644 --- a/R/decon.R +++ b/R/decon.R @@ -479,7 +479,8 @@ decontX <- function(counts, zMessage <- "" if (is.null(z)) { - zMessage <- "\nEstimated cell clusters z is saved in the result as well." + zMessage <- paste0("\nEstimated cell clusters z is saved in the", + " result as well.") } .logMessages( paste(rep("-", 50), collapse = ""), @@ -711,7 +712,8 @@ decontX <- function(counts, " 'counts' matrix.") } if (length(unique(z)) < 2) { - stop("No need to decontaminate when only one cluster is in the dataset.") # Even though + stop("No need to decontaminate when only one cluster", + " is in the dataset.") # Even though # everything runs smoothly when length(unique(z)) == 1, result is not # trustful } @@ -751,7 +753,8 @@ addLogLikelihood <- function(llA, llB) { logfile = NULL) { if (!is(object, "SingleCellExperiment")) { - sce <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = object)) + sce <- SingleCellExperiment::SingleCellExperiment(assays = + list(counts = object)) } ## Add the log2 normalized counts into sce object @@ -767,9 +770,11 @@ addLogLikelihood <- function(llA, llB) { ## (celda_CG & Louvian graph algorithm) mvTrend <- scran::trendVar(sce, use.spikes = FALSE) decomposeTrend <- scran::decomposeVar(sce, mvTrend) - topVariableGenes <- order(decomposeTrend$bio, decreasing = TRUE)[1:varGenes] + topVariableGenes <- order(decomposeTrend$bio, + decreasing = TRUE)[seq(varGenes)] } - countsFiltered <- as.matrix(SingleCellExperiment::counts(sce[topVariableGenes, ])) + countsFiltered <- as.matrix(SingleCellExperiment::counts( + sce[topVariableGenes, ])) storage.mode(countsFiltered) <- "integer" .logMessages( @@ -806,8 +811,8 @@ addLogLikelihood <- function(llA, llB) { paste(rep("-", 50), collapse = ""), "\n", paste(rep(" ", 12), collapse = ""), - "Use umap to reduce features into 2 dimensions for cell community detection", - "\n", + "Use umap to reduce features into 2 dimensions", + " for cell community detection\n", paste(rep(" ", 12), collapse = ""), paste(rep("-", 50), collapse = ""), sep = "", From 18c4b34d5c51f0f5bb594914605173f8da5d24d3 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Fri, 13 Dec 2019 12:01:56 -0500 Subject: [PATCH 102/149] customizable cluster labels fix #226 --- R/plot_dr.R | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index e546d32b..ffc7df99 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -410,6 +410,7 @@ plotDimReduceCluster <- function(dim1, labelClusters = FALSE, groupBy = NULL, labelSize = 3.5) { + if (!is.null(groupBy)) { df <- data.frame(dim1, dim2, cluster, groupBy) colnames(df) <- c(xlab, ylab, "Cluster", "Sample") @@ -417,13 +418,16 @@ plotDimReduceCluster <- function(dim1, df <- data.frame(dim1, dim2, cluster) colnames(df) <- c(xlab, ylab, "Cluster") } + naIx <- is.na(dim1) | is.na(dim2) df <- df[!naIx, ] df[3] <- as.factor(df[[3]]) clusterColors <- distinctColors(nlevels(as.factor(cluster))) + if (!is.null(specificClusters)) { clusterColors[!levels(df[[3]]) %in% specificClusters] <- "gray92" } + g <- ggplot2::ggplot(df, ggplot2::aes_string(x = xlab, y = ylab)) + ggplot2::geom_point(stat = "identity", size = size, @@ -441,21 +445,23 @@ plotDimReduceCluster <- function(dim1, #centroidList <- lapply(seq(length(unique(cluster))), function(x) { centroidList <- lapply(unique(cluster), function(x) { df.sub <- df[df$Cluster == x, ] - median.1 <- stats::median(df.sub$Dimension_1) - median.2 <- stats::median(df.sub$Dimension_2) + median.1 <- stats::median(df.sub[, xlab]) + median.2 <- stats::median(df.sub[, ylab]) cbind(median.1, median.2, x) }) centroid <- do.call(rbind, centroidList) - centroid <- as.data.frame(centroid) + centroid <- data.frame(Dimension_1 = as.numeric(centroid[, 1]), + Dimension_2 = as.numeric(centroid[, 2]), + Cluster = centroid[, 3]) - colnames(centroid) <- c("Dimension_1", "Dimension_2", "Cluster") + colnames(centroid)[seq(2)] <- c(xlab, ylab) g <- g + ggplot2::geom_point(data = centroid, - mapping = ggplot2::aes_string(x = "Dimension_1", - y = "Dimension_2"), + mapping = ggplot2::aes_string(x = xlab, + y = ylab), size = 0, alpha = 0) + ggrepel::geom_text_repel(data = centroid, - mapping = ggplot2::aes_string(label = "Cluster"), + mapping = ggplot2::aes(label = Cluster), size = labelSize) } if (!is.null(x = groupBy)) { From ad3517a33661e5ead7bc524d3450252d56a7583a Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Fri, 13 Dec 2019 12:23:11 -0500 Subject: [PATCH 103/149] Add doc for ncol. Customizable cluster labels #226 --- R/plot_dr.R | 67 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/R/plot_dr.R b/R/plot_dr.R index 03c9b797..530a3887 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -17,8 +17,10 @@ #' The color will be used to signify the midpoint on the scale. #' @param colorHigh Character. A color available from `colors()`. #' The color will be used to signify the highest values on the scale. -#' Default 'blue'. +#' Default 'blue'. #' @param varLabel Character vector. Title for the color legend. +#' @param ncol Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +#' number of columns for facet wrap. #' @param headers Character vector. If `NULL`, the corresponding rownames are #' used as labels. Otherwise, these headers are used to label the genes. #' @return The plot as a ggplot object @@ -48,6 +50,7 @@ plotDimReduceGrid <- function(dim1, colorMid, colorHigh, varLabel, + ncol = NULL, headers = NULL) { df <- data.frame(dim1, dim2, t(as.data.frame(matrix))) @@ -66,7 +69,6 @@ plotDimReduceGrid <- function(dim1, ggplot2::geom_point(stat = "identity", size = size, ggplot2::aes_string(color = varLabel)) + - ggplot2::facet_wrap(~ facet, labeller = headers) + ggplot2::theme_bw() + ggplot2::scale_colour_gradient2(low = colorLow, high = colorHigh, @@ -79,6 +81,12 @@ plotDimReduceGrid <- function(dim1, panel.spacing = unit(0, "lines"), panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black")) + if (isFALSE(is.null(ncol))) { + g <- g + ggplot2::facet_wrap(~ facet, labeller = headers, + ncol = ncol) + } else { + g <- g + ggplot2::facet_wrap(~ facet, labeller = headers) + } } else { g <- ggplot2::ggplot(m, ggplot2::aes_string(x = xlab, y = ylab)) + @@ -98,6 +106,11 @@ plotDimReduceGrid <- function(dim1, panel.spacing = unit(0, "lines"), panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black")) + if (isFALSE(is.null(ncol))) { + g <- g + ggplot2::facet_wrap(~ facet, ncol = ncol) + } else { + g <- g + ggplot2::facet_wrap(~ facet) + } } return(g) } @@ -134,6 +147,8 @@ plotDimReduceGrid <- function(dim1, #' will be used to signify the midpoint on the scale. Default 'white'. #' @param colorHigh Character. A color available from `colors()`. The color #' will be used to signify the highest values on the scale. Default 'red'. +#' @param ncol Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +#' number of columns for facet wrap. #' @return The plot as a ggplot object #' @examples #' \donttest{ @@ -160,7 +175,8 @@ plotDimReduceFeature <- function(dim1, ylab = "Dimension_2", colorLow = "blue", colorMid = "white", - colorHigh = "red") { + colorHigh = "red", + ncol = NULL) { if (isFALSE(is.null(headers))) { if (length(headers) != length(features)) { @@ -250,6 +266,7 @@ plotDimReduceFeature <- function(dim1, colorMid, colorHigh, varLabel, + ncol, headers) } @@ -283,6 +300,8 @@ plotDimReduceFeature <- function(dim1, #' @param colorHigh Character. A color available from `colors()`. #' The color will be used to signify the highest values on the scale. #' Default 'blue'. +#' @param ncol Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +#' number of columns for facet wrap. #' @return The plot as a ggplot object #' @examples #' \donttest{ @@ -307,7 +326,8 @@ plotDimReduceModule <- ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, - colorHigh = "blue") { + colorHigh = "blue", + ncol = NULL) { factorized <- factorizeMatrix(celdaMod = celdaMod, counts = counts) @@ -344,7 +364,8 @@ plotDimReduceModule <- colorLow, colorMid, colorHigh, - varLabel) + varLabel, + ncol) } @@ -368,6 +389,8 @@ plotDimReduceModule <- #' If NULL, all clusters will be colored. Default NULL. #' @param labelClusters Logical. Whether the cluster labels are plotted. #' Default FALSE. +#' @param groupBy Character vector. Contains sample labels for each cell. +#' If NULL, all samples will be plotted together. Default NULL. #' @param labelSize Numeric. Sets size of label if labelClusters is TRUE. #' Default 3.5. #' @return The plot as a ggplot object @@ -391,16 +414,26 @@ plotDimReduceCluster <- function(dim1, ylab = "Dimension_2", specificClusters = NULL, labelClusters = FALSE, + groupBy = NULL, labelSize = 3.5) { - df <- data.frame(dim1, dim2, cluster) - colnames(df) <- c(xlab, ylab, "Cluster") + + if (!is.null(groupBy)) { + df <- data.frame(dim1, dim2, cluster, groupBy) + colnames(df) <- c(xlab, ylab, "Cluster", "Sample") + } else { + df <- data.frame(dim1, dim2, cluster) + colnames(df) <- c(xlab, ylab, "Cluster") + } + naIx <- is.na(dim1) | is.na(dim2) df <- df[!naIx, ] df[3] <- as.factor(df[[3]]) clusterColors <- distinctColors(nlevels(as.factor(cluster))) + if (!is.null(specificClusters)) { clusterColors[!levels(df[[3]]) %in% specificClusters] <- "gray92" } + g <- ggplot2::ggplot(df, ggplot2::aes_string(x = xlab, y = ylab)) + ggplot2::geom_point(stat = "identity", size = size, @@ -418,23 +451,29 @@ plotDimReduceCluster <- function(dim1, #centroidList <- lapply(seq(length(unique(cluster))), function(x) { centroidList <- lapply(unique(cluster), function(x) { df.sub <- df[df$Cluster == x, ] - median.1 <- stats::median(df.sub$Dimension_1) - median.2 <- stats::median(df.sub$Dimension_2) + median.1 <- stats::median(df.sub[, xlab]) + median.2 <- stats::median(df.sub[, ylab]) cbind(median.1, median.2, x) }) centroid <- do.call(rbind, centroidList) - centroid <- as.data.frame(centroid) + centroid <- data.frame(Dimension_1 = as.numeric(centroid[, 1]), + Dimension_2 = as.numeric(centroid[, 2]), + Cluster = centroid[, 3]) - colnames(centroid) <- c("Dimension_1", "Dimension_2", "Cluster") + colnames(centroid)[seq(2)] <- c(xlab, ylab) g <- g + ggplot2::geom_point(data = centroid, - mapping = ggplot2::aes_string(x = "Dimension_1", - y = "Dimension_2"), + mapping = ggplot2::aes_string(x = xlab, + y = ylab), size = 0, alpha = 0) + ggrepel::geom_text_repel(data = centroid, - mapping = ggplot2::aes_string(label = "Cluster"), + mapping = ggplot2::aes(label = Cluster), size = labelSize) } + if (!is.null(x = groupBy)) { + g <- g + facet_wrap(facets = vars(!!sym(x = "Sample"))) + + theme(strip.background = element_blank()) + } return(g) } From 8588f03ec8606e5ddcfc2a6c7fcaf0ada5fd7266 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Fri, 13 Dec 2019 12:38:23 -0500 Subject: [PATCH 104/149] fix lints --- R/celda_G.R | 2 +- R/decon.R | 2 +- R/findMarkers.R | 10 +++++----- R/getDecisions.R | 4 ++-- R/semi_pheatmap.R | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/R/celda_G.R b/R/celda_G.R index ae6ae5bc..5e88c222 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -1210,7 +1210,7 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), if (is.null(maxCells) || maxCells > ncol(counts)) { maxCells <- ncol(counts) - cellIx <- 1:ncol(counts) + cellIx <- seq(ncol(counts)) } else { cellIx <- sample(seq(ncol(counts)), maxCells) } diff --git a/R/decon.R b/R/decon.R index f296e493..1a1835f3 100644 --- a/R/decon.R +++ b/R/decon.R @@ -750,7 +750,7 @@ addLogLikelihood <- function(llA, llB) { # transitional z label cbZ <- as.integer(plyr::mapvalues(cbZ, from = levels(cbZ), - to = 1:length(levels(cbZ)))) + to = seq(length(levels(cbZ))))) return(list( diff --git a/R/findMarkers.R b/R/findMarkers.R index b4405988..8d5a286c 100644 --- a/R/findMarkers.R +++ b/R/findMarkers.R @@ -101,7 +101,7 @@ findMarkers <- function(features, stop("NA class values") } - if (any(is.na(features))){ + if (any(is.na(features))) { stop("NA feature values") } @@ -178,7 +178,7 @@ findMarkers <- function(features, dendro <- tree$dendro # Create separate trees for each cell type with more than one cluster - newTrees <- lapply(largeCellTypes, function(cellType){ + newTrees <- lapply(largeCellTypes, function(cellType) { # Print current status message("Building tree for cell type ", cellType) @@ -204,7 +204,7 @@ findMarkers <- function(features, newLabels == cellType]))) # Adjust 'rules' table for new tree - newTree$rules <- lapply(newTree$rules, function(rules){ + newTree$rules <- lapply(newTree$rules, function(rules) { rules$level <- rules$level + max(tree$rules[[cellType]]$level) rules <- rbind(tree$rules[[cellType]], rules) }) @@ -268,7 +268,7 @@ findMarkers <- function(features, cellTypeDendro <- newTrees[[cellType]]$dendro # Adjust labels, member count, and midpoint of nodes - dendro <- dendrapply(dendro, function(node){ + dendro <- dendrapply(dendro, function(node) { # Check if in right branch if (cellType %in% as.character(attributes(node)$classLabels)) { # Replace cell type label with subtype labels @@ -311,7 +311,7 @@ findMarkers <- function(features, cellTypeHeight <- attributes(cellTypeDendro)$height cellTypeDendro <- dendrapply(cellTypeDendro, function(node, parentHeight, cellTypeHeight) { - if (attributes(node)$height > 1){ + if (attributes(node)$height > 1) { attributes(node)$height <- parentHeight - 1 - (cellTypeHeight - attributes( node)$height) diff --git a/R/getDecisions.R b/R/getDecisions.R index 118be596..c6d268a7 100644 --- a/R/getDecisions.R +++ b/R/getDecisions.R @@ -34,7 +34,7 @@ getDecisions <- function(rules, features) { } # Function to predict class from list of rules -.predictClass <- function(samp, rules){ +.predictClass <- function(samp, rules) { # Initilize possible classes and level classes <- names(rules) @@ -60,7 +60,7 @@ getDecisions <- function(rules, features) { ruleClass$sample <- samp[ruleClass$feature] # For multiple direction == 1, use one with the top stat - if (sum(ruleClass$direction == 1) > 1){ + if (sum(ruleClass$direction == 1) > 1) { ruleClass <- ruleClass[order( ruleClass$direction , decreasing = T), ] diff --git a/R/semi_pheatmap.R b/R/semi_pheatmap.R index d977a828..f8169b53 100755 --- a/R/semi_pheatmap.R +++ b/R/semi_pheatmap.R @@ -755,7 +755,7 @@ vplayout <- function(x, y) { } # Omit border color if cell size is too small - if (mindim < 3){ + if (mindim < 3) { borderColor <- NA } @@ -898,8 +898,8 @@ vplayout <- function(x, y) { } # Draw annotation legend - annotation <- c(annotationCol[length(annotationCol):1], - annotationRow[length(annotationRow):1]) + annotation <- c(annotationCol[seq(length(annotationCol), 1)], + annotationRow[seq(length(annotationRow), 1)]) annotation <- annotation[unlist(lapply(annotation, function(x) !.is.na2(x)))] @@ -1494,7 +1494,7 @@ vplayout <- function(x, y) { #' clusteringDistanceCols = dcols) #' #' # Modify ordering of the clusters using clustering callback option -#' callback = function(hc, mat){ +#' callback = function(hc, mat) { #' sv = svd(t(mat))$v[, 1] #' dend = reorder(as.dendrogram(hc), wts = sv) #' as.hclust(dend) From ac66da49ab98612c35332f61d89f31894deb0675 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Mon, 16 Dec 2019 11:29:41 -0500 Subject: [PATCH 105/149] version bump update docs --- DESCRIPTION | 2 +- man/plotDimReduceFeature.Rd | 3 +++ man/plotDimReduceGrid.Rd | 5 ++++- man/plotDimReduceModule.Rd | 3 +++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 86c07061..d4cb692a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.3.0 +Version: 1.3.1 Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), diff --git a/man/plotDimReduceFeature.Rd b/man/plotDimReduceFeature.Rd index 871b356b..812e7733 100644 --- a/man/plotDimReduceFeature.Rd +++ b/man/plotDimReduceFeature.Rd @@ -62,6 +62,9 @@ will be used to signify the midpoint on the scale. Default 'white'.} \item{colorHigh}{Character. A color available from `colors()`. The color will be used to signify the highest values on the scale. Default 'red'.} + +\item{ncol}{Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +number of columns for facet wrap.} } \value{ The plot as a ggplot object diff --git a/man/plotDimReduceGrid.Rd b/man/plotDimReduceGrid.Rd index 1f9b328c..e6afc0b6 100644 --- a/man/plotDimReduceGrid.Rd +++ b/man/plotDimReduceGrid.Rd @@ -44,10 +44,13 @@ The color will be used to signify the midpoint on the scale.} \item{colorHigh}{Character. A color available from `colors()`. The color will be used to signify the highest values on the scale. - Default 'blue'.} +Default 'blue'.} \item{varLabel}{Character vector. Title for the color legend.} +\item{ncol}{Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +number of columns for facet wrap.} + \item{headers}{Character vector. If `NULL`, the corresponding rownames are used as labels. Otherwise, these headers are used to label the genes.} } diff --git a/man/plotDimReduceModule.Rd b/man/plotDimReduceModule.Rd index 8ab5f99d..1fc90c71 100644 --- a/man/plotDimReduceModule.Rd +++ b/man/plotDimReduceModule.Rd @@ -56,6 +56,9 @@ The color will be used to signify the midpoint on the scale.} \item{colorHigh}{Character. A color available from `colors()`. The color will be used to signify the highest values on the scale. Default 'blue'.} + +\item{ncol}{Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +number of columns for facet wrap.} } \value{ The plot as a ggplot object From 010b39c1c76915d3eca7ce2296ae7da9bf754b7e Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 27 Dec 2019 21:35:04 -0500 Subject: [PATCH 106/149] Updated messaging --- R/decon.R | 168 ++++++++++++++++++------------------------------------ 1 file changed, 57 insertions(+), 111 deletions(-) diff --git a/R/decon.R b/R/decon.R index f95e8370..d09993aa 100644 --- a/R/decon.R +++ b/R/decon.R @@ -305,7 +305,7 @@ decontX <- function(counts, L = NULL, dbscanEps = NULL, seed = 12345) { - + if (is.null(seed)) { res <- .decontX(counts = counts, z = z, @@ -359,18 +359,20 @@ decontX <- function(counts, nC <- ncol(counts) allCellNames <- colnames(counts) - .logMessages( - paste(rep("-", 50), collapse = ""), - "\n", - "Starting DecontX. Decontamination", - "\n", - paste(rep("-", 50), collapse = ""), - sep = "", + startTime <- Sys.time() + .logMessages(paste(rep("-", 50), collapse = ""), logfile = logfile, append = TRUE, - verbose = verbose - ) - + verbose = verbose) + .logMessages("Starting DecontX", + logfile = logfile, + append = TRUE, + verbose = verbose) + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose) + if (!is.null(batch)) { ## Set result lists upfront for all cells from different batches logLikelihood <- c() @@ -388,19 +390,12 @@ decontX <- function(counts, for (bat in batchIndex) { .logMessages( - paste(rep(" ", 4), collapse = ""), - paste(rep("-", 50), collapse = ""), - "\n", - paste(rep(" ", 4), collapse = ""), - "Estimate contamination within batch ", - bat, - "\n", - paste(rep(" ", 4), collapse = ""), - paste(rep("-", 50), collapse = ""), - sep = "", - logfile = logfile, - append = TRUE, - verbose = verbose + date(), + ".. Analyzing cells in batch", + bat, + logfile = logfile, + append = TRUE, + verbose = verbose ) zBat <- NULL @@ -477,23 +472,20 @@ decontX <- function(counts, } } - zMessage <- "" - if (is.null(z)) { - zMessage <- paste0("\nEstimated cell clusters z is saved in the", - " result as well.") - } - .logMessages( - paste(rep("-", 50), collapse = ""), - "\n", - "All is done", - zMessage, - "\n", - paste(rep("-", 50), collapse = ""), - sep = "", + endTime <- Sys.time() + .logMessages(paste(rep("-", 50), collapse = ""), logfile = logfile, append = TRUE, - verbose = verbose - ) + verbose = verbose) + .logMessages("Completed DecontX. Total time:", + format(difftime(endTime, startTime)), + logfile = logfile, + append = TRUE, + verbose = verbose) + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose) return(returnResult) @@ -520,18 +512,8 @@ decontX <- function(counts, if (is.null(z)) { .logMessages( - paste(rep(" ", 8), collapse = ""), - paste(rep("-", 50), collapse = ""), - "\n", - paste(rep(" ", 8), collapse = ""), - "Start to estimate broad cell types", - "\n", - paste(rep(" ", 8), collapse = ""), - "which will then be used for DecontX contamination estimation.", - "\n", - paste(rep(" ", 8), collapse = ""), - paste(rep("-", 50), collapse = ""), - sep = "", + date(), + ".. Estimating cell types with Celda", logfile = logfile, append = TRUE, verbose = verbose @@ -558,20 +540,12 @@ decontX <- function(counts, stopIter <- 3L .logMessages( - paste(rep(" ", 8), collapse = ""), - paste(rep("-", 50), collapse = ""), - "\n", - paste(rep(" ", 8), collapse = ""), - "Estimate contamination", - "\n", - paste(rep(" ", 8), collapse = ""), - paste(rep("-", 50), collapse = ""), - sep = "", + date(), + ".. Estimating contamination", logfile = logfile, append = TRUE, verbose = verbose ) - startTime <- Sys.time() if (deconMethod == "clustering") { ## Initialization @@ -633,39 +607,29 @@ decontX <- function(counts, } else { numIterWithoutImprovement <- numIterWithoutImprovement + 1L } - iter <- iter + 1L + + .logMessages(date(), + ".... Completed iteration:", + iter, + "| logLik:", + llTemp, + logfile = logfile, + append = TRUE, + verbose = verbose) + + iter <- iter + 1L } } resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) - endTime <- Sys.time() if (!is.null(batch)) { batchMessage <- paste(" ", "in batch ", batch, ".", sep = "") } else { batchMessage <- "." } - .logMessages( - paste(rep(" ", 8), collapse = ""), - paste(rep("-", 50), collapse = ""), - "\n", - paste(rep(" ", 8), collapse = ""), - "Contamination estimation is completed", - batchMessage, - "\n", - paste(rep(" ", 8), collapse = ""), - "DecontX time: ", - format(difftime(endTime, startTime)), - "\n", - paste(rep(" ", 8), collapse = ""), - paste(rep("-", 50), collapse = ""), - sep = "", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - + runParams <- list("deltaInit" = deltaInit, "iteration" = iter - 1L, "z" = z) @@ -778,17 +742,10 @@ addLogLikelihood <- function(llA, llB) { storage.mode(countsFiltered) <- "integer" .logMessages( - paste(rep(" ", 12), collapse = ""), - paste(rep("-", 50), collapse = ""), - "\n", - paste(rep(" ", 12), collapse = ""), - "Collapse genes into ", + date(), + ".... Collapsing features into", L, - " gene modules", - "\n", - paste(rep(" ", 12), collapse = ""), - paste(rep("-", 50), collapse = ""), - sep = "", + "modules", logfile = logfile, append = TRUE, verbose = verbose @@ -807,15 +764,8 @@ addLogLikelihood <- function(llA, llB) { } .logMessages( - paste(rep(" ", 12), collapse = ""), - paste(rep("-", 50), collapse = ""), - "\n", - paste(rep(" ", 12), collapse = ""), - "Use umap to reduce features into 2 dimensions", - " for cell community detection\n", - paste(rep(" ", 12), collapse = ""), - paste(rep("-", 50), collapse = ""), - sep = "", + date(), + ".... Reducing dimensionality with UMAP", logfile = logfile, append = TRUE, verbose = verbose @@ -827,15 +777,11 @@ addLogLikelihood <- function(llA, llB) { rm(fm) .logMessages( - paste(rep(" ", 12), collapse = ""), - paste(rep("-", 50), collapse = ""), - "\n", - paste(rep(" ", 12), collapse = ""), - "Use density-based model DBSCAN to detect cell community", - "\n", - paste(rep(" ", 12), collapse = ""), - paste(rep("-", 50), collapse = ""), - sep = "", + date(), + " .... Determining cell clusters with DBSCAN (Eps=", + dbscanEps, + ")", + sep="", logfile = logfile, append = TRUE, verbose = verbose From 12e008ea54d8f4c7855217c2d7fad6e5f2c16c78 Mon Sep 17 00:00:00 2001 From: Irisapo Date: Sat, 28 Dec 2019 08:50:59 -0500 Subject: [PATCH 107/149] fix mistake in mcmc fix-iteration --- R/decon.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/R/decon.R b/R/decon.R index f95e8370..b6ec8faf 100644 --- a/R/decon.R +++ b/R/decon.R @@ -182,16 +182,23 @@ simulateContaminatedMatrix <- function(C = 300, Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) Pc <- 1 - Pr - deltaV2 <- - MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha + #deltaV2 <- + # MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha estRmat <- t(Pr) * counts rnGByK <- .colSumByGroupNumeric(estRmat, z, K) cnGByK <- rowSums(rnGByK) - rnGByK + TNbyC <- colSums(counts) + estRbyCol <- colSums(estRmat) + + PrbyC <- estRbyCol / TNbyC + PcbyC <- 1 - PrbyC + deltaV2 <- MCMCprecision::fit_dirichlet(cbind(PrbyC, PcbyC))$alpha + ## Update parameters theta <- - (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) + (estRbyCol + deltaV2[1]) / (TNbyC + sum(deltaV2)) phi <- normalizeCounts(rnGByK, normalize = "proportion", pseudocountNormalize = 1e-20) From ea7c67a3963980fc5089d77b1a23bc4b8a3bbf6f Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 28 Dec 2019 12:47:37 -0500 Subject: [PATCH 108/149] Corrected call to dirichlet_fit to estimate deltas. Also made some speed improvements --- R/decon.R | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/R/decon.R b/R/decon.R index d09993aa..d4e1ec5d 100644 --- a/R/decon.R +++ b/R/decon.R @@ -177,21 +177,35 @@ simulateContaminatedMatrix <- function(C = 300, delta) { ## Notes: use fix-point iteration to update prior for theta, no need ## to feed delta anymore + logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20) logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20) - - Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) - Pc <- 1 - Pr - deltaV2 <- - MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha - - estRmat <- t(Pr) * counts +# Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) +#print(head(logPr[1:5,1:5])) +# logPr <- t(log(phi + 1e-20)[,z]) + log(theta + 1e-20) +# logPc <- t(log(eta + 1e-20)[,z]) + log(1 - theta + 1e-20) +#print(head(logPr1[1:5,1:5])) + Pr.e <- exp(logPr) + Pc.e <- exp(logPc) + Pr <- Pr.e / (Pr.e + Pc.e) +# Pc <- 1 - Pr +# deltaV2 <- MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha + + estRmat <- t(Pr) * counts +# estRmat <- counts * Pr rnGByK <- .colSumByGroupNumeric(estRmat, z, K) cnGByK <- rowSums(rnGByK) - rnGByK + counts.cs = colSums(counts) + estRmat.cs <- colSums(estRmat) + estRmat.cs.n <- estRmat.cs / counts.cs + estCmat.cs.n <- 1 - estRmat.cs.n + temp <- cbind(estRmat.cs.n, estCmat.cs.n) + deltaV2 <- MCMCprecision::fit_dirichlet(temp)$alpha + ## Update parameters theta <- - (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) + (estRmat.cs + deltaV2[1]) / (counts.cs + sum(deltaV2)) phi <- normalizeCounts(rnGByK, normalize = "proportion", pseudocountNormalize = 1e-20) From 402ad7e32c42e1f843bc8868f2f4f0a5b2edb301 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 09:08:34 -0500 Subject: [PATCH 109/149] Added new fast DecontX functions which utilized RcppArmadillo sparse matrix operations --- src/DecontX.cpp | 221 ++++++++++++++++++++++++++++++++++++++++++++ src/Makevars | 17 +++- src/Makevars.win | 17 +++- src/RcppExports.cpp | 51 ++++++++++ 4 files changed, 296 insertions(+), 10 deletions(-) create mode 100644 src/DecontX.cpp diff --git a/src/DecontX.cpp b/src/DecontX.cpp new file mode 100644 index 00000000..004bf74f --- /dev/null +++ b/src/DecontX.cpp @@ -0,0 +1,221 @@ +// [[Rcpp::depends(RcppArmadillo)]] +#include +using namespace Rcpp; + + +// [[Rcpp::export]] +Rcpp::List decontXEM(const arma::sp_mat& counts, + const NumericVector& counts_colsums, + const NumericVector& theta, + const NumericMatrix& eta, + const NumericMatrix& phi, + const IntegerVector& z, + const double& pseudocount) { + + // Perform error checking + if(counts.n_cols != theta.size()) { + stop("Length of 'theta' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != z.size()) { + stop("Length of 'z' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != counts_colsums.size()) { + stop("Length of 'counts_colsums' must be equal to the number of columns in 'counts'."); + } + if(counts.n_rows != phi.nrow()) { + stop("The number of rows in 'phi' must be equal to the number of rows in 'counts'."); + } + if(counts.n_rows != eta.nrow()) { + stop("The number of rows in 'eta' must be equal to the number of rows in 'counts'."); + } + if(phi.ncol() != eta.ncol()) { + stop("The number of columns in 'eta' must be equal to the number of columns in 'phi'."); + } + if(min(z) < 1 || max(z) > eta.ncol()) { + stop("The entries in 'z' need to be between 1 and the number of columns in eta and phi."); + } + + // Declare variables and functions + NumericVector new_theta(theta.size()); + NumericVector native_total(theta.size()); + NumericMatrix new_phi(phi.nrow(), phi.ncol()); + NumericMatrix new_eta(eta.nrow(), eta.ncol()); + + std::fill(new_phi.begin(), new_phi.end(), pseudocount); + std::fill(new_eta.begin(), new_eta.end(), pseudocount); + + // Obtaining 'fit_dirichlet' function from MCMCprecision package + Environment pkg = Environment::namespace_env("MCMCprecision"); + Function f = pkg["fit_dirichlet"]; + + int i; + int j; + int k; + double x; + double pcontamin; + double pnative; + double normp; + double px; + for (arma::sp_mat::const_iterator it = counts.begin(); it != counts.end(); ++it) { + i = it.row(); + j = it.col(); + x = *it; + k = z[j] - 1; + + // Calculate variational probabilities + pnative = log(phi(i,k) + pseudocount) + log(theta(j) + pseudocount); + pcontamin = log(eta(i,k) + pseudocount) + log(1 - theta(j) + pseudocount); + + // Normalize probabilities and add to proper components + normp = exp(pnative) / (exp(pcontamin) + exp(pnative)); + px = normp * x; + new_phi(i,k) += px; + native_total(j) += px; + } + + // Calculate Eta using Weights from Phi + NumericVector phi_rowsum = rowSums(new_phi); + for(i = 0; i < new_eta.ncol(); i++) { + for(j = 0; j < new_eta.nrow(); j++) { + new_eta(j,i) = phi_rowsum[j] - new_phi(j,i); + } + } + + // Normalize Phi and Eta + NumericVector phi_colsum = colSums(new_phi); + NumericVector eta_colsum = colSums(new_eta); + for(i = 0; i < new_phi.ncol(); i++) { + new_phi(_,i) = new_phi(_,i) / phi_colsum[i]; + new_eta(_,i) = new_eta(_,i) / eta_colsum[i]; + } + + // Update Theta + NumericVector contamination_prop = (counts_colsums - native_total) / counts_colsums; + NumericVector native_prop = 1 - contamination_prop; + NumericMatrix theta_raw = cbind(native_prop, contamination_prop); + + Rcpp::List result = f(Named("x", theta_raw)); + NumericVector delta = result["alpha"]; + new_theta = (native_total + delta[0]) / (counts_colsums + result["sum"]); + + return Rcpp::List::create(Rcpp::Named("phi") = new_phi, + Rcpp::Named("eta") = new_eta, + Rcpp::Named("theta") = new_theta, + Rcpp::Named("delta") = delta, + Rcpp::Named("contamination") = contamination_prop); +} + + + + +// [[Rcpp::export]] +double decontXLogLik(const arma::sp_mat& counts, + const NumericVector& theta, + const NumericMatrix& eta, + const NumericMatrix& phi, + const IntegerVector& z, + const double& pseudocount) { + + // Perform error checking + if(counts.n_cols != theta.size()) { + stop("Length of 'theta' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != z.size()) { + stop("Length of 'z' must be equal to the number of columns in 'counts'."); + } + if(counts.n_rows != phi.nrow()) { + stop("The number of rows in 'phi' must be equal to the number of rows in 'counts'."); + } + if(counts.n_rows != eta.nrow()) { + stop("The number of rows in 'eta' must be equal to the number of rows in 'counts'."); + } + if(phi.ncol() != eta.ncol()) { + stop("The number of columns in 'eta' must be equal to the number of columns in 'phi'."); + } + if(min(z) < 1 || max(z) > eta.ncol()) { + stop("The entries in 'z' need to be between 1 and the number of columns in eta and phi."); + } + + // Declare variables and functions + double loglik = 0; + + int i; + int j; + int k; + double x; + + // Original R code: + // ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + + // (1 - theta) * t(eta)[z, ] + 1e-20)) + + for (arma::sp_mat::const_iterator it = counts.begin(); it != counts.end(); ++it) { + i = it.row(); + j = it.col(); + x = *it; + k = z[j] - 1; + + loglik += x * log((phi(i,k) * theta(j)) + (eta(i,k) * (1 - theta(j))) + pseudocount); + } + + return loglik; +} + + + + + +// [[Rcpp::export]] +Rcpp::List decontXInitialize(const arma::sp_mat& counts, + const NumericVector& theta, + const IntegerVector& z, + const double& pseudocount) { + + // Perform error checking + if(counts.n_cols != theta.size()) { + stop("Length of 'theta' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != z.size()) { + stop("Length of 'z' must be equal to the number of columns in 'counts'."); + } + + // Declare variables and functions + NumericMatrix new_phi(counts.n_rows, max(z)); + NumericMatrix new_eta(counts.n_rows, max(z)); + std::fill(new_phi.begin(), new_phi.end(), pseudocount); + std::fill(new_eta.begin(), new_eta.end(), pseudocount); + + int i; + int j; + int k; + double x; + + for (arma::sp_mat::const_iterator it = counts.begin(); it != counts.end(); ++it) { + i = it.row(); + j = it.col(); + x = *it; + k = z[j] - 1; + + new_phi(i,k) += x * theta(j); + } + + // Calculate Eta using Weights from Phi + NumericVector phi_rowsum = rowSums(new_phi); + for(i = 0; i < new_eta.ncol(); i++) { + for(j = 0; j < new_eta.nrow(); j++) { + new_eta(j,i) = phi_rowsum[j] - new_phi(j,i); + } + } + + // Normalize Phi and Eta + NumericVector phi_colsum = colSums(new_phi); + NumericVector eta_colsum = colSums(new_eta); + for(i = 0; i < new_phi.ncol(); i++) { + new_phi(_,i) = new_phi(_,i) / phi_colsum[i]; + new_eta(_,i) = new_eta(_,i) / eta_colsum[i]; + } + + return Rcpp::List::create(Rcpp::Named("phi") = new_phi, + Rcpp::Named("eta") = new_eta); + +} + diff --git a/src/Makevars b/src/Makevars index 8fa55139..d3e3f414 100755 --- a/src/Makevars +++ b/src/Makevars @@ -1,7 +1,14 @@ -## With Rcpp 0.11.0 and later, we no longer need to set PKG_LIBS as there is -## no user-facing library. The include path to headers is already set by R. -#PKG_LIBS = ## With R 3.1.0 or later, you can uncomment the following line to tell R to -## enable compilation with C++11 (or even C++14) where available -#CXX_STD = CXX11 +## enable compilation with C++11 (where available) +## +## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider +## availability of the package we do not yet enforce this here. It is however +## recommended for client packages to set it. +## +## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP +## support within Armadillo prefers / requires it +CXX_STD = CXX11 + +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win b/src/Makevars.win index 8fa55139..d3e3f414 100755 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,7 +1,14 @@ -## With Rcpp 0.11.0 and later, we no longer need to set PKG_LIBS as there is -## no user-facing library. The include path to headers is already set by R. -#PKG_LIBS = ## With R 3.1.0 or later, you can uncomment the following line to tell R to -## enable compilation with C++11 (or even C++14) where available -#CXX_STD = CXX11 +## enable compilation with C++11 (where available) +## +## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider +## availability of the package we do not yet enforce this here. It is however +## recommended for client packages to set it. +## +## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP +## support within Armadillo prefers / requires it +CXX_STD = CXX11 + +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 118e61c4..6b2f3bf3 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -2,10 +2,58 @@ // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include +#include #include using namespace Rcpp; +// decontXEM +Rcpp::List decontXEM(const arma::sp_mat& counts, const NumericVector& counts_colsums, const NumericVector& theta, const NumericMatrix& eta, const NumericMatrix& phi, const IntegerVector& z, const double& pseudocount); +RcppExport SEXP _celda_decontXEM(SEXP countsSEXP, SEXP counts_colsumsSEXP, SEXP thetaSEXP, SEXP etaSEXP, SEXP phiSEXP, SEXP zSEXP, SEXP pseudocountSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type counts_colsums(counts_colsumsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type eta(etaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type phi(phiSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type z(zSEXP); + Rcpp::traits::input_parameter< const double& >::type pseudocount(pseudocountSEXP); + rcpp_result_gen = Rcpp::wrap(decontXEM(counts, counts_colsums, theta, eta, phi, z, pseudocount)); + return rcpp_result_gen; +END_RCPP +} +// decontXLogLik +double decontXLogLik(const arma::sp_mat& counts, const NumericVector& theta, const NumericMatrix& eta, const NumericMatrix& phi, const IntegerVector& z, const double& pseudocount); +RcppExport SEXP _celda_decontXLogLik(SEXP countsSEXP, SEXP thetaSEXP, SEXP etaSEXP, SEXP phiSEXP, SEXP zSEXP, SEXP pseudocountSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type eta(etaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type phi(phiSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type z(zSEXP); + Rcpp::traits::input_parameter< const double& >::type pseudocount(pseudocountSEXP); + rcpp_result_gen = Rcpp::wrap(decontXLogLik(counts, theta, eta, phi, z, pseudocount)); + return rcpp_result_gen; +END_RCPP +} +// decontXInitialize +Rcpp::List decontXInitialize(const arma::sp_mat& counts, const NumericVector& theta, const IntegerVector& z, const double& pseudocount); +RcppExport SEXP _celda_decontXInitialize(SEXP countsSEXP, SEXP thetaSEXP, SEXP zSEXP, SEXP pseudocountSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type z(zSEXP); + Rcpp::traits::input_parameter< const double& >::type pseudocount(pseudocountSEXP); + rcpp_result_gen = Rcpp::wrap(decontXInitialize(counts, theta, z, pseudocount)); + return rcpp_result_gen; +END_RCPP +} // cG_calcGibbsProbY_Simple NumericVector cG_calcGibbsProbY_Simple(const IntegerMatrix counts, IntegerVector nGbyTS, IntegerMatrix nTSbyC, IntegerVector nbyTS, IntegerVector nbyG, const IntegerVector y, const int L, const int index, const double gamma, const double beta, const double delta); RcppExport SEXP _celda_cG_calcGibbsProbY_Simple(SEXP countsSEXP, SEXP nGbyTSSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP indexSEXP, SEXP gammaSEXP, SEXP betaSEXP, SEXP deltaSEXP) { @@ -154,6 +202,9 @@ RcppExport SEXP _rowSumByGroup_numeric(SEXP, SEXP); RcppExport SEXP _rowSumByGroupChange(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { + {"_celda_decontXEM", (DL_FUNC) &_celda_decontXEM, 7}, + {"_celda_decontXLogLik", (DL_FUNC) &_celda_decontXLogLik, 6}, + {"_celda_decontXInitialize", (DL_FUNC) &_celda_decontXInitialize, 4}, {"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11}, {"_celda_cG_CalcGibbsProbY_ori", (DL_FUNC) &_celda_cG_CalcGibbsProbY_ori, 13}, {"_celda_cG_CalcGibbsProbY_fastRow", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastRow, 13}, From a8930fa5da36ab126cb0be443a1b757181204a1e Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 09:24:03 -0500 Subject: [PATCH 110/149] changed order of LinksTo in description to avoid compile errors with Armadillo. Removed 'import Matrix' to avoid warnings --- DESCRIPTION | 3 +- R/RcppExports.R | 12 ++++ R/decon.R | 166 ++++++++++++++++++++++++++++---------------- man/decontX.Rd | 1 + src/DecontX.cpp | 5 +- src/RcppExports.cpp | 2 +- 6 files changed, 125 insertions(+), 64 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d4cb692a..4a7cc268 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,6 +33,7 @@ Imports: S4Vectors, data.table, Rcpp, + RcppArmadillo, RcppEigen, uwot, enrichR, @@ -62,7 +63,7 @@ Suggests: M3DExampleData, BiocManager, BiocStyle -LinkingTo: Rcpp, RcppEigen +LinkingTo: Rcpp, RcppArmadillo, RcppEigen License: MIT + file LICENSE Encoding: UTF-8 LazyData: false diff --git a/R/RcppExports.R b/R/RcppExports.R index 4734cebd..55140e14 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,18 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +decontXEM <- function(counts, counts_colsums, theta, eta, phi, z, pseudocount) { + .Call('_celda_decontXEM', PACKAGE = 'celda', counts, counts_colsums, theta, eta, phi, z, pseudocount) +} + +decontXLogLik <- function(counts, theta, eta, phi, z, pseudocount) { + .Call('_celda_decontXLogLik', PACKAGE = 'celda', counts, theta, eta, phi, z, pseudocount) +} + +decontXInitialize <- function(counts, theta, z, pseudocount) { + .Call('_celda_decontXInitialize', PACKAGE = 'celda', counts, theta, z, pseudocount) +} + cG_calcGibbsProbY_Simple <- function(counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) { .Call('_celda_cG_calcGibbsProbY_Simple', PACKAGE = 'celda', counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) } diff --git a/R/decon.R b/R/decon.R index d4e1ec5d..d882baf5 100644 --- a/R/decon.R +++ b/R/decon.R @@ -142,7 +142,7 @@ simulateContaminatedMatrix <- function(C = 300, .deconCalcLL <- function(counts, z, phi, eta, theta) { # ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + # 1e-20 ) ) # when dist_mat are K x G matrices - ll <- sum(t(counts) * log(theta * t(phi)[z, ] + + ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + (1 - theta) * t(eta)[z, ] + 1e-20)) return(ll) } @@ -180,19 +180,11 @@ simulateContaminatedMatrix <- function(C = 300, logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20) logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20) -# Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) -#print(head(logPr[1:5,1:5])) -# logPr <- t(log(phi + 1e-20)[,z]) + log(theta + 1e-20) -# logPc <- t(log(eta + 1e-20)[,z]) + log(1 - theta + 1e-20) -#print(head(logPr1[1:5,1:5])) Pr.e <- exp(logPr) Pc.e <- exp(logPc) Pr <- Pr.e / (Pr.e + Pc.e) -# Pc <- 1 - Pr -# deltaV2 <- MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha estRmat <- t(Pr) * counts -# estRmat <- counts * Pr rnGByK <- .colSumByGroupNumeric(estRmat, z, K) cnGByK <- rowSums(rnGByK) - rnGByK @@ -313,6 +305,7 @@ decontX <- function(counts, batch = NULL, maxIter = 200, delta = 10, + convergence = 0.01, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -326,6 +319,7 @@ decontX <- function(counts, batch = batch, maxIter = maxIter, delta = delta, + convergence = convergence, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -338,6 +332,7 @@ decontX <- function(counts, batch = batch, maxIter = maxIter, delta = delta, + convergence = convergence, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -354,25 +349,13 @@ decontX <- function(counts, batch = NULL, maxIter = 200, delta = 10, + convergence = 0.01, logfile = NULL, verbose = TRUE, varGenes = NULL, dbscanEps = NULL, L = NULL) { - ## Empty expression genes won't be used for estimation - haveEmptyGenes <- FALSE - totalGenes <- nrow(counts) - noneEmptyGeneIndex <- rowSums(counts) != 0 - geneNames <- rownames(counts) - if (sum(noneEmptyGeneIndex) != totalGenes) { - counts <- counts[noneEmptyGeneIndex, ] - haveEmptyGenes <- TRUE - } - - nC <- ncol(counts) - allCellNames <- colnames(counts) - startTime <- Sys.time() .logMessages(paste(rep("-", 50), collapse = ""), logfile = logfile, @@ -386,16 +369,35 @@ decontX <- function(counts, logfile = logfile, append = TRUE, verbose = verbose) + + # Convert to sparse matrix + # After Celda can run on sparse matrix, + # then we can just have this be required + # as input + counts = as(counts, "dgCMatrix") + + ## Empty expression genes won't be used for estimation + haveEmptyGenes <- FALSE + totalGenes <- nrow(counts) + noneEmptyGeneIndex <- Matrix::rowSums(counts) != 0 + geneNames <- rownames(counts) + if (sum(noneEmptyGeneIndex) != totalGenes) { + counts <- counts[noneEmptyGeneIndex, ] + haveEmptyGenes <- TRUE + } + + nC <- ncol(counts) + allCellNames <- colnames(counts) if (!is.null(batch)) { ## Set result lists upfront for all cells from different batches logLikelihood <- c() - estRmat <- matrix( - 0, - ncol = ncol(counts), - nrow = totalGenes, - dimnames = list(geneNames, allCellNames) - ) +# estRmat <- matrix( +# 0, +# ncol = ncol(counts), +# nrow = totalGenes, +# dimnames = list(geneNames, allCellNames) +# ) theta <- rep(NA, nC) estConp <- rep(NA, nC) returnZ <- rep(NA, nC) @@ -423,6 +425,7 @@ decontX <- function(counts, batch = bat, maxIter = maxIter, delta = delta, + convergence = convergence, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -472,6 +475,7 @@ decontX <- function(counts, z = z, maxIter = maxIter, delta = delta, + convergence = convergence, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -512,6 +516,7 @@ decontX <- function(counts, batch = NULL, maxIter = 200, delta = 10, + convergence = 0.01, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -560,7 +565,7 @@ decontX <- function(counts, append = TRUE, verbose = verbose ) - + if (deconMethod == "clustering") { ## Initialization deltaInit <- delta @@ -568,37 +573,64 @@ decontX <- function(counts, theta <- stats::rbeta(n = nC, shape1 = deltaInit, shape2 = deltaInit) - estRmat <- t(t(counts) * theta) - phi <- .colSumByGroupNumeric(estRmat, z, K) - eta <- rowSums(phi) - phi - phi <- normalizeCounts(phi, - normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- normalizeCounts(eta, - normalize = "proportion", - pseudocountNormalize = 1e-20) + + nextDecon <- decontXInitialize( + counts = counts, + theta = theta, + z = z, + pseudocount = 1e-20) + phi <- nextDecon$phi + eta <- nextDecon$eta + +# estRmat <- Matrix::t(Matrix::t(counts) * theta) +# phi <- .colSumByGroupNumeric(as.matrix(estRmat), z, K) +# eta <- rowSums(phi) - phi +# phi <- normalizeCounts(phi, +# normalize = "proportion", +# pseudocountNormalize = 1e-20) +# eta <- normalizeCounts(eta, +# normalize = "proportion", +# pseudocountNormalize = 1e-20) ll <- c() - llRound <- .deconCalcLL( + +# llRound <- .deconCalcLL( +# counts = counts, +# z = z, +# phi = phi, +# eta = eta, +# theta = theta +# ) + llRound <- decontXLogLik( counts = counts, z = z, phi = phi, eta = eta, - theta = theta - ) + theta = theta, + pseudocount = 1e-20) ## EM updates - while (iter <= maxIter & + theta.previous <- theta + converged <- FALSE + counts.colsums = Matrix::colSums(counts) + while (iter <= maxIter & !isTRUE(converged) & numIterWithoutImprovement <= stopIter) { - nextDecon <- .cDCalcEMDecontamination( - counts = counts, - phi = phi, - eta = eta, - theta = theta, - z = z, - K = K, - delta = delta - ) +# nextDecon <- .cDCalcEMDecontamination( +# counts = counts, +# phi = phi, +# eta = eta, +# theta = theta, +# z = z, +# K = K, +# delta = delta +# ) + nextDecon <- decontXEM(counts = counts, + counts_colsums = counts.colsums, + phi = phi, + eta = eta, + theta = theta, + z = z, + pseudocount = 1e-20) theta <- nextDecon$theta phi <- nextDecon$phi @@ -606,13 +638,21 @@ decontX <- function(counts, delta <- nextDecon$delta ## Calculate log-likelihood - llTemp <- .deconCalcLL( +# llTemp <- .deconCalcLL( +# counts = counts, +# z = z, +# phi = phi, +# eta = eta, +# theta = theta +# ) + llTemp <- decontXLogLik( counts = counts, z = z, phi = phi, eta = eta, - theta = theta - ) + theta = theta, + pseudocount = 1e-20) + ll <- c(ll, llTemp) llRound <- c(llRound, round(llTemp, 2)) @@ -622,22 +662,30 @@ decontX <- function(counts, numIterWithoutImprovement <- numIterWithoutImprovement + 1L } + max.divergence <- max(abs(theta.previous - theta)) + if(max.divergence < convergence) { + converged <- TRUE + } + theta.previous <- theta + .logMessages(date(), ".... Completed iteration:", iter, - "| logLik:", - llTemp, + "| converge:", + signif(max.divergence, 4), logfile = logfile, append = TRUE, verbose = verbose) - + iter <- iter + 1L } } - resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) - +# resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) + resConp <- nextDecon$contamination + names(resConp) <- colnames(counts) + if (!is.null(batch)) { batchMessage <- paste(" ", "in batch ", batch, ".", sep = "") } else { diff --git a/man/decontX.Rd b/man/decontX.Rd index 48e3c244..e930b4f2 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -10,6 +10,7 @@ decontX( batch = NULL, maxIter = 200, delta = 10, + convergence = 0.01, logfile = NULL, verbose = TRUE, varGenes = NULL, diff --git a/src/DecontX.cpp b/src/DecontX.cpp index 004bf74f..ce54be7a 100644 --- a/src/DecontX.cpp +++ b/src/DecontX.cpp @@ -1,7 +1,6 @@ -// [[Rcpp::depends(RcppArmadillo)]] #include -using namespace Rcpp; - +// [[Rcpp::depends(RcppArmadillo)]] +using namespace Rcpp; using namespace arma; // [[Rcpp::export]] Rcpp::List decontXEM(const arma::sp_mat& counts, diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6b2f3bf3..15f89529 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,8 +1,8 @@ // Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -#include #include +#include #include using namespace Rcpp; From c46d74c2754b0593c020cb17bde94384373da339 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 17:17:53 -0500 Subject: [PATCH 111/149] Updated DecontX to calculate native counts in cpp. Made several adjustments to the R decontX call --- R/RcppExports.R | 4 +++ R/decon.R | 78 ++++++++++++++++++++++++++++++++------------- src/DecontX.cpp | 68 +++++++++++++++++++++++++++++++++++++++ src/RcppExports.cpp | 20 ++++++++++++ 4 files changed, 148 insertions(+), 22 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 55140e14..eeaa6b79 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -13,6 +13,10 @@ decontXInitialize <- function(counts, theta, z, pseudocount) { .Call('_celda_decontXInitialize', PACKAGE = 'celda', counts, theta, z, pseudocount) } +calculateNativeMatrix <- function(counts, native_counts, theta, eta, phi, z, row_index, col_index, pseudocount) { + .Call('_celda_calculateNativeMatrix', PACKAGE = 'celda', counts, native_counts, theta, eta, phi, z, row_index, col_index, pseudocount) +} + cG_calcGibbsProbY_Simple <- function(counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) { .Call('_celda_cG_calcGibbsProbY_Simple', PACKAGE = 'celda', counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) } diff --git a/R/decon.R b/R/decon.R index d882baf5..74d3d338 100644 --- a/R/decon.R +++ b/R/decon.R @@ -305,7 +305,7 @@ decontX <- function(counts, batch = NULL, maxIter = 200, delta = 10, - convergence = 0.01, + convergence = 0.001, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -349,7 +349,7 @@ decontX <- function(counts, batch = NULL, maxIter = 200, delta = 10, - convergence = 0.01, + convergence = 0.001, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -379,6 +379,7 @@ decontX <- function(counts, ## Empty expression genes won't be used for estimation haveEmptyGenes <- FALSE totalGenes <- nrow(counts) + totalCells <- ncol(counts) noneEmptyGeneIndex <- Matrix::rowSums(counts) != 0 geneNames <- rownames(counts) if (sum(noneEmptyGeneIndex) != totalGenes) { @@ -388,16 +389,19 @@ decontX <- function(counts, nC <- ncol(counts) allCellNames <- colnames(counts) + + estRmat <- Matrix::Matrix( + data = 0, + ncol = totalCells, + nrow = totalGenes, + sparse = TRUE, + dimnames = list(geneNames, allCellNames) + ) if (!is.null(batch)) { ## Set result lists upfront for all cells from different batches logLikelihood <- c() -# estRmat <- matrix( -# 0, -# ncol = ncol(counts), -# nrow = totalGenes, -# dimnames = list(geneNames, allCellNames) -# ) + theta <- rep(NA, nC) estConp <- rep(NA, nC) returnZ <- rep(NA, nC) @@ -433,13 +437,24 @@ decontX <- function(counts, L = L ) - if (haveEmptyGenes) { - estRmat[noneEmptyGeneIndex, batch == bat] <- - resBat$resList$estNativeCounts - } else { - estRmat[, batch == bat] <- - resBat$resList$estNativeCounts - } +# if (haveEmptyGenes) { +# estRmat[cbind(which(noneEmptyGeneIndex), which(batch == bat))] <- +# resBat$resList$estNativeCounts +# } else { +# estRmat[cbind(seq(nrow(counts)), which(batch == bat))] <- +# resBat$resList$estNativeCounts +# } + estRmat <- calculateNativeMatrix( + counts = countsBat, + native_counts = estRmat, + theta = resBat$resList$theta, + eta = resBat$resList$eta, + row_index = which(noneEmptyGeneIndex), + col_index = which(batch == bat), + phi = resBat$resList$phi, + z = as.integer(resBat$runParams$z), + pseudocount = 1e-20) + estConp[batch == bat] <- resBat$resList$estConp theta[batch == bat] <- resBat$resList$theta returnZ[batch == bat] <- resBat$runParams$z @@ -482,12 +497,29 @@ decontX <- function(counts, dbscanEps = dbscanEps, L = L ) - if (haveEmptyGenes) { - resBat <- matrix(0, nrow = totalGenes, ncol = nC, - dimnames = list(geneNames, allCellNames)) - resBat[noneEmptyGeneIndex, ] <- returnResult$resList$estNativeCounts - returnResult$resList$estNativeCounts <- resBat - } +# if (haveEmptyGenes) { +# resBat <- matrix(0, nrow = totalGenes, ncol = nC, +# dimnames = list(geneNames, allCellNames)) +# ix <- rep(which(noneEmptyGeneIndex), nC) +# jx <- rep(seq(nC), sum(noneEmptyGeneIndex)) +# resBat <- sparseMatrix(i=ix, j=jx, +# x = returnResult$resList$estNativeCounts@x, +# dims = c(totalGenes, nC), +# dimnames = list(geneNames, allCellNames)) +# resBat[cbind(which(noneEmptyGeneIndex), seq(nC))] <- returnResult$resList$estNativeCounts + + estRmat <- calculateNativeMatrix( + counts = counts, + native_counts = estRmat, + theta = returnResult$resList$theta, + eta = returnResult$resList$eta, + row_index = which(noneEmptyGeneIndex), + col_index = seq(totalCells), + phi = returnResult$resList$phi, + z = as.integer(returnResult$runParams$z), + pseudocount = 1e-20) + returnResult$resList$estNativeCounts <- estRmat +# } } endTime <- Sys.time() @@ -701,7 +733,9 @@ decontX <- function(counts, "estNativeCounts" = nextDecon$estRmat, "estConp" = resConp, "theta" = theta, - "delta" = delta + "delta" = delta, + "phi" = phi, + "eta" = eta ) return(list( diff --git a/src/DecontX.cpp b/src/DecontX.cpp index ce54be7a..e4a57f12 100644 --- a/src/DecontX.cpp +++ b/src/DecontX.cpp @@ -218,3 +218,71 @@ Rcpp::List decontXInitialize(const arma::sp_mat& counts, } + + + +// [[Rcpp::export]] +arma::sp_mat calculateNativeMatrix(const arma::sp_mat& counts, + const arma::sp_mat& native_counts, + const NumericVector& theta, + const NumericMatrix& eta, + const NumericMatrix& phi, + const IntegerVector& z, + const IntegerVector row_index, + const IntegerVector col_index, + const double& pseudocount) { + + // Perform error checking + if(counts.n_cols != theta.size()) { + stop("Length of 'theta' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != z.size()) { + stop("Length of 'z' must be equal to the number of columns in 'counts'."); + } + if(counts.n_rows != phi.nrow()) { + stop("The number of rows in 'phi' must be equal to the number of rows in 'counts'."); + } + if(counts.n_rows != eta.nrow()) { + stop("The number of rows in 'eta' must be equal to the number of rows in 'counts'."); + } + if(phi.ncol() != eta.ncol()) { + stop("The number of columns in 'eta' must be equal to the number of columns in 'phi'."); + } + if(min(z) < 1 || max(z) > eta.ncol()) { + stop("The entries in 'z' need to be between 1 and the number of columns in eta and phi."); + } + if(max(row_index) - 1 > native_counts.n_rows || min(row_index) < 0) { + stop("The entries in 'row_index' need to be less than 'nrow(native_counts)' and greater than 0."); + } + if(max(col_index) - 1 > native_counts.n_cols || min(col_index) < 0) { + stop("The entries in 'row_index' need to be less than 'ncol(native_counts)' and greater than 0."); + } + + arma::sp_mat native_matrix = native_counts; + + int i; + int j; + int k; + double x; + double pcontamin; + double pnative; + double normp; + for (arma::sp_mat::const_iterator it = counts.begin(); it != counts.end(); ++it) { + i = it.row(); + j = it.col(); + x = *it; + k = z[j] - 1; + + // Calculate variational probabilities + pnative = log(phi(i,k) + pseudocount) + log(theta(j) + pseudocount); + pcontamin = log(eta(i,k) + pseudocount) + log(1 - theta(j) + pseudocount); + + // Normalize probabilities and add to proper components + normp = exp(pnative) / (exp(pcontamin) + exp(pnative)); + native_matrix.at(row_index(i) - 1, col_index(j) - 1) = normp * x; + } + + return native_matrix; +} + + diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 15f89529..34d48cda 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -54,6 +54,25 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// calculateNativeMatrix +arma::sp_mat calculateNativeMatrix(const arma::sp_mat& counts, const arma::sp_mat& native_counts, const NumericVector& theta, const NumericMatrix& eta, const NumericMatrix& phi, const IntegerVector& z, const IntegerVector row_index, const IntegerVector col_index, const double& pseudocount); +RcppExport SEXP _celda_calculateNativeMatrix(SEXP countsSEXP, SEXP native_countsSEXP, SEXP thetaSEXP, SEXP etaSEXP, SEXP phiSEXP, SEXP zSEXP, SEXP row_indexSEXP, SEXP col_indexSEXP, SEXP pseudocountSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const arma::sp_mat& >::type native_counts(native_countsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type eta(etaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type phi(phiSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type z(zSEXP); + Rcpp::traits::input_parameter< const IntegerVector >::type row_index(row_indexSEXP); + Rcpp::traits::input_parameter< const IntegerVector >::type col_index(col_indexSEXP); + Rcpp::traits::input_parameter< const double& >::type pseudocount(pseudocountSEXP); + rcpp_result_gen = Rcpp::wrap(calculateNativeMatrix(counts, native_counts, theta, eta, phi, z, row_index, col_index, pseudocount)); + return rcpp_result_gen; +END_RCPP +} // cG_calcGibbsProbY_Simple NumericVector cG_calcGibbsProbY_Simple(const IntegerMatrix counts, IntegerVector nGbyTS, IntegerMatrix nTSbyC, IntegerVector nbyTS, IntegerVector nbyG, const IntegerVector y, const int L, const int index, const double gamma, const double beta, const double delta); RcppExport SEXP _celda_cG_calcGibbsProbY_Simple(SEXP countsSEXP, SEXP nGbyTSSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP indexSEXP, SEXP gammaSEXP, SEXP betaSEXP, SEXP deltaSEXP) { @@ -205,6 +224,7 @@ static const R_CallMethodDef CallEntries[] = { {"_celda_decontXEM", (DL_FUNC) &_celda_decontXEM, 7}, {"_celda_decontXLogLik", (DL_FUNC) &_celda_decontXLogLik, 6}, {"_celda_decontXInitialize", (DL_FUNC) &_celda_decontXInitialize, 4}, + {"_celda_calculateNativeMatrix", (DL_FUNC) &_celda_calculateNativeMatrix, 9}, {"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11}, {"_celda_cG_CalcGibbsProbY_ori", (DL_FUNC) &_celda_cG_CalcGibbsProbY_ori, 13}, {"_celda_cG_CalcGibbsProbY_fastRow", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastRow, 13}, From 9dc93f129779d787f7caf216e2bb4e8d7192af5d Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 21:30:00 -0500 Subject: [PATCH 112/149] More speed improvements to DecontX by referencing matrix values rather than copy with [] --- man/decontX.Rd | 2 +- src/DecontX.cpp | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/man/decontX.Rd b/man/decontX.Rd index e930b4f2..e019aacf 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -10,7 +10,7 @@ decontX( batch = NULL, maxIter = 200, delta = 10, - convergence = 0.01, + convergence = 0.001, logfile = NULL, verbose = TRUE, varGenes = NULL, diff --git a/src/DecontX.cpp b/src/DecontX.cpp index e4a57f12..c6f9cebe 100644 --- a/src/DecontX.cpp +++ b/src/DecontX.cpp @@ -2,6 +2,8 @@ // [[Rcpp::depends(RcppArmadillo)]] using namespace Rcpp; using namespace arma; + + // [[Rcpp::export]] Rcpp::List decontXEM(const arma::sp_mat& counts, const NumericVector& counts_colsums, @@ -49,7 +51,8 @@ Rcpp::List decontXEM(const arma::sp_mat& counts, int i; int j; - int k; + int k; + int nr = phi.nrow(); double x; double pcontamin; double pnative; @@ -61,12 +64,18 @@ Rcpp::List decontXEM(const arma::sp_mat& counts, x = *it; k = z[j] - 1; - // Calculate variational probabilities - pnative = log(phi(i,k) + pseudocount) + log(theta(j) + pseudocount); - pcontamin = log(eta(i,k) + pseudocount) + log(1 - theta(j) + pseudocount); + // Calculate variational probabilities + // Removing the log/exp speeds it up and produces the same result since + // there are only 2 probabilities being multiplied + + //pnative = log(phi(i,k) + pseudocount) + log(theta(j) + pseudocount); + //pcontamin = log(eta(i,k) + pseudocount) + log(1 - theta(j) + pseudocount); + pnative = (phi[nr * k + i] + pseudocount) * (theta[j] + pseudocount); + pcontamin = (eta[nr * k + i] + pseudocount) * (1 - theta[j] + pseudocount); // Normalize probabilities and add to proper components - normp = exp(pnative) / (exp(pcontamin) + exp(pnative)); + //normp = exp(pnative) / (exp(pcontamin) + exp(pnative)); + normp = pnative / (pcontamin + pnative); px = normp * x; new_phi(i,k) += px; native_total(j) += px; @@ -141,7 +150,8 @@ double decontXLogLik(const arma::sp_mat& counts, int i; int j; int k; - double x; + double x; + int nr = phi.nrow(); // Original R code: // ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + @@ -153,7 +163,7 @@ double decontXLogLik(const arma::sp_mat& counts, x = *it; k = z[j] - 1; - loglik += x * log((phi(i,k) * theta(j)) + (eta(i,k) * (1 - theta(j))) + pseudocount); + loglik += x * log((phi[nr * k + i] * theta[j]) + (eta[nr * k + i] * (1 - theta[j])) + pseudocount); } return loglik; @@ -161,8 +171,6 @@ double decontXLogLik(const arma::sp_mat& counts, - - // [[Rcpp::export]] Rcpp::List decontXInitialize(const arma::sp_mat& counts, const NumericVector& theta, From 0289774a83615cea8cc4c935467928e8ff6118d5 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 21:48:11 -0500 Subject: [PATCH 113/149] Updated decontX test units --- tests/testthat/test-decon.R | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/tests/testthat/test-decon.R b/tests/testthat/test-decon.R index 0ce35af2..936cf5b5 100644 --- a/tests/testthat/test-decon.R +++ b/tests/testthat/test-decon.R @@ -3,14 +3,9 @@ library(celda) context("Testing Deconx") deconSim <- simulateContaminatedMatrix(K = 10, delta = c(1, 5)) -modelDecontXoneBatch <- .decontXoneBatch(deconSim$observedCounts, +modelDecontXoneBatch <- .decontX(deconSim$observedCounts, z = deconSim$z, maxIter = 2) -modelDecontXoneBatchIter1 <- .decontXoneBatch(deconSim$observedCounts, - z = deconSim$z, - maxIter = 1) -modelDecontXoneBatchbg <- decontX(deconSim$observedCounts, - maxIter = 2) deconSim2 <- simulateContaminatedMatrix(K = 10, delta = 5) batchDecontX <- decontX(cbind(deconSim$observedCounts, @@ -18,10 +13,6 @@ batchDecontX <- decontX(cbind(deconSim$observedCounts, z = c(deconSim$z, deconSim2$z), batch = rep(seq(2), each = ncol(deconSim$observedCounts)), maxIter = 2) -batchDecontXBg <- decontX(cbind(deconSim$observedCounts, - deconSim2$observedCounts), - batch = rep(seq(2), each = ncol(deconSim$observedCounts)), - maxIter = 2) ## simulateContaminatedMatrix test_that(desc = "Testing simulateContaminatedMatrix", { @@ -38,18 +29,15 @@ test_that(desc = "Testing simulateContaminatedMatrix", { ## .decontXoneBatch test_that(desc = "Testing .decontXoneBatch", { - expect_equal(modelDecontXoneBatch$resList$estConp, - 1 - colSums(modelDecontXoneBatch$resList$estNativeCounts) / - colSums(deconSim$observedCounts)) - expect_error(.decontXoneBatch(counts = deconSim$observedCounts, + expect_error(decontX(counts = deconSim$observedCounts, z = deconSim$z, delta = -1), "'delta' should be a single positive value.") - expect_error(.decontXoneBatch(counts = deconSim$observedCounts, + expect_error(decontX(counts = deconSim$observedCounts, z = deconSim$z, delta = c(1, 1)), "'delta' should be a single positive value.") - expect_error(.decontXoneBatch(counts = deconSim$observedCounts, + expect_error(decontX(counts = deconSim$observedCounts, z = c(deconSim$z, 1)), paste0("'z' must be of the same length as the number of cells in the", " 'counts' matrix.")) @@ -63,12 +51,6 @@ test_that(desc = "Testing .decontXoneBatch", { "Missing value in 'counts' matrix.") }) -test_that(desc = "Testing .decontXoneBatch using background distribution", { - expect_equal( - modelDecontXoneBatchbg$resList$estConp, - 1 - colSums(modelDecontXoneBatchbg$resList$estNativeCounts) / - deconSim$NByC) -}) ## logLikelihood #test_that(desc = "Testing logLikelihood.DecontXoneBatch", { From 565d86d095f5ab48183ec620217b01201f89f8b1 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 22:23:17 -0500 Subject: [PATCH 114/149] Updated call to Scran to use modelGeneVar to find top most variable genes. UMAP is now returned. --- R/decon.R | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/R/decon.R b/R/decon.R index 74d3d338..e72846e1 100644 --- a/R/decon.R +++ b/R/decon.R @@ -561,6 +561,7 @@ decontX <- function(counts, nC <- ncol(counts) deconMethod <- "clustering" + umap <- NULL if (is.null(z)) { .logMessages( date(), @@ -576,11 +577,16 @@ decontX <- function(counts, dbscanEps <- .processdbscanEps(dbscanEps) L <- .processL(L) - z <- .decontxInitializeZ(object = counts, + celda.init <- .decontxInitializeZ(object = counts, varGenes = varGenes, dbscanEps = dbscanEps, verbose = verbose, logfile = logfile) + z <- celda.init$z + umap <- celda.init$umap + colnames(umap) <- c("DecontX_UMAP_1", + "DecontX_UMAP_2") + rownames(umap) <- colnames(counts) } z <- .processCellLabels(z, numCells = nC) @@ -727,7 +733,10 @@ decontX <- function(counts, runParams <- list("deltaInit" = deltaInit, "iteration" = iter - 1L, "z" = z) - + if(!is.null(umap)) { + runParams[["UMAP"]] <- umap + } + resList <- list( "logLikelihood" = ll, "estNativeCounts" = nextDecon$estRmat, @@ -828,10 +837,14 @@ addLogLikelihood <- function(llA, llB) { } else if (nrow(sce) > varGenes) { ## Use the top most variable genes to do rough clustering ## (celda_CG & Louvian graph algorithm) - mvTrend <- scran::trendVar(sce, use.spikes = FALSE) - decomposeTrend <- scran::decomposeVar(sce, mvTrend) - topVariableGenes <- order(decomposeTrend$bio, - decreasing = TRUE)[seq(varGenes)] + #mvTrend <- scran::trendVar(sce, use.spikes = FALSE) + #decomposeTrend <- scran::decomposeVar(sce, mvTrend) + #topVariableGenes <- order(decomposeTrend$bio, + # decreasing = TRUE)[seq(varGenes)] + + sce.var <- scran::modelGeneVar(sce) + topVariableGenes <- order(sce.var$bio, + decreasing = TRUE)[seq(varGenes)] } countsFiltered <- as.matrix(SingleCellExperiment::counts( sce[topVariableGenes, ])) @@ -885,12 +898,13 @@ addLogLikelihood <- function(llA, llB) { # Use dbSCAN on the UMAP to identify broad cell types totalClusters <- 1 while (totalClusters <= 1 & dbscanEps > 0) { - resDbscan <- dbscan::dbscan(resUmap, dbscanEps) - dbscanEps <- dbscanEps - (0.25 * dbscanEps) - totalClusters <- length(unique(resDbscan$cluster)) + resDbscan <- dbscan::dbscan(resUmap, dbscanEps) + dbscanEps <- dbscanEps - (0.25 * dbscanEps) + totalClusters <- length(unique(resDbscan$cluster)) } - return("z" = resDbscan$cluster) + return(list("z" = resDbscan$cluster, + "umap" = resUmap)) } From 50796b337b1b6b2d851f07ed93fbd3646fabcc44 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 22:30:56 -0500 Subject: [PATCH 115/149] Fix lints --- R/decon.R | 70 +++++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/R/decon.R b/R/decon.R index e72846e1..c6c2ad36 100644 --- a/R/decon.R +++ b/R/decon.R @@ -184,12 +184,12 @@ simulateContaminatedMatrix <- function(C = 300, Pc.e <- exp(logPc) Pr <- Pr.e / (Pr.e + Pc.e) - estRmat <- t(Pr) * counts + estRmat <- t(Pr) * counts rnGByK <- .colSumByGroupNumeric(estRmat, z, K) cnGByK <- rowSums(rnGByK) - rnGByK - counts.cs = colSums(counts) - estRmat.cs <- colSums(estRmat) + counts.cs <- colSums(counts) + estRmat.cs <- colSums(estRmat) estRmat.cs.n <- estRmat.cs / counts.cs estCmat.cs.n <- 1 - estRmat.cs.n temp <- cbind(estRmat.cs.n, estCmat.cs.n) @@ -305,14 +305,14 @@ decontX <- function(counts, batch = NULL, maxIter = 200, delta = 10, - convergence = 0.001, + convergence = 0.001, logfile = NULL, verbose = TRUE, varGenes = NULL, L = NULL, dbscanEps = NULL, seed = 12345) { - + if (is.null(seed)) { res <- .decontX(counts = counts, z = z, @@ -349,7 +349,7 @@ decontX <- function(counts, batch = NULL, maxIter = 200, delta = 10, - convergence = 0.001, + convergence = 0.001, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -372,9 +372,9 @@ decontX <- function(counts, # Convert to sparse matrix # After Celda can run on sparse matrix, - # then we can just have this be required - # as input - counts = as(counts, "dgCMatrix") + # then we can just have this be required + # as input + counts <- as(counts, "dgCMatrix") ## Empty expression genes won't be used for estimation haveEmptyGenes <- FALSE @@ -397,11 +397,11 @@ decontX <- function(counts, sparse = TRUE, dimnames = list(geneNames, allCellNames) ) - + if (!is.null(batch)) { ## Set result lists upfront for all cells from different batches logLikelihood <- c() - + theta <- rep(NA, nC) estConp <- rep(NA, nC) returnZ <- rep(NA, nC) @@ -437,13 +437,6 @@ decontX <- function(counts, L = L ) -# if (haveEmptyGenes) { -# estRmat[cbind(which(noneEmptyGeneIndex), which(batch == bat))] <- -# resBat$resList$estNativeCounts -# } else { -# estRmat[cbind(seq(nrow(counts)), which(batch == bat))] <- -# resBat$resList$estNativeCounts -# } estRmat <- calculateNativeMatrix( counts = countsBat, native_counts = estRmat, @@ -454,7 +447,7 @@ decontX <- function(counts, phi = resBat$resList$phi, z = as.integer(resBat$runParams$z), pseudocount = 1e-20) - + estConp[batch == bat] <- resBat$resList$estConp theta[batch == bat] <- resBat$resList$theta returnZ[batch == bat] <- resBat$runParams$z @@ -497,16 +490,6 @@ decontX <- function(counts, dbscanEps = dbscanEps, L = L ) -# if (haveEmptyGenes) { -# resBat <- matrix(0, nrow = totalGenes, ncol = nC, -# dimnames = list(geneNames, allCellNames)) -# ix <- rep(which(noneEmptyGeneIndex), nC) -# jx <- rep(seq(nC), sum(noneEmptyGeneIndex)) -# resBat <- sparseMatrix(i=ix, j=jx, -# x = returnResult$resList$estNativeCounts@x, -# dims = c(totalGenes, nC), -# dimnames = list(geneNames, allCellNames)) -# resBat[cbind(which(noneEmptyGeneIndex), seq(nC))] <- returnResult$resList$estNativeCounts estRmat <- calculateNativeMatrix( counts = counts, @@ -519,7 +502,6 @@ decontX <- function(counts, z = as.integer(returnResult$runParams$z), pseudocount = 1e-20) returnResult$resList$estNativeCounts <- estRmat -# } } endTime <- Sys.time() @@ -597,13 +579,13 @@ decontX <- function(counts, stopIter <- 3L .logMessages( - date(), + date(), ".. Estimating contamination", logfile = logfile, append = TRUE, verbose = verbose ) - + if (deconMethod == "clustering") { ## Initialization deltaInit <- delta @@ -619,7 +601,7 @@ decontX <- function(counts, pseudocount = 1e-20) phi <- nextDecon$phi eta <- nextDecon$eta - + # estRmat <- Matrix::t(Matrix::t(counts) * theta) # phi <- .colSumByGroupNumeric(as.matrix(estRmat), z, K) # eta <- rowSums(phi) - phi @@ -650,8 +632,8 @@ decontX <- function(counts, ## EM updates theta.previous <- theta converged <- FALSE - counts.colsums = Matrix::colSums(counts) - while (iter <= maxIter & !isTRUE(converged) & + counts.colsums <- Matrix::colSums(counts) + while (iter <= maxIter & !isTRUE(converged) & numIterWithoutImprovement <= stopIter) { # nextDecon <- .cDCalcEMDecontamination( # counts = counts, @@ -701,7 +683,7 @@ decontX <- function(counts, } max.divergence <- max(abs(theta.previous - theta)) - if(max.divergence < convergence) { + if (max.divergence < convergence) { converged <- TRUE } theta.previous <- theta @@ -714,8 +696,8 @@ decontX <- function(counts, logfile = logfile, append = TRUE, verbose = verbose) - - iter <- iter + 1L + + iter <- iter + 1L } } @@ -723,13 +705,13 @@ decontX <- function(counts, # resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) resConp <- nextDecon$contamination names(resConp) <- colnames(counts) - + if (!is.null(batch)) { batchMessage <- paste(" ", "in batch ", batch, ".", sep = "") } else { batchMessage <- "." } - + runParams <- list("deltaInit" = deltaInit, "iteration" = iter - 1L, "z" = z) @@ -743,7 +725,7 @@ decontX <- function(counts, "estConp" = resConp, "theta" = theta, "delta" = delta, - "phi" = phi, + "phi" = phi, "eta" = eta ) @@ -873,7 +855,7 @@ addLogLikelihood <- function(llA, llB) { } .logMessages( - date(), + date(), ".... Reducing dimensionality with UMAP", logfile = logfile, append = TRUE, @@ -886,11 +868,11 @@ addLogLikelihood <- function(llA, llB) { rm(fm) .logMessages( - date(), + date(), " .... Determining cell clusters with DBSCAN (Eps=", dbscanEps, ")", - sep="", + sep = "", logfile = logfile, append = TRUE, verbose = verbose From bb1219438178ddabe4caffcf6048b4ec6e61892c Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 23:12:49 -0500 Subject: [PATCH 116/149] Updated decontX documentation --- R/decon.R | 76 ++++++++++++++++++++++++-------------------------- man/decontX.Rd | 68 +++++++++++++++++++++----------------------- 2 files changed, 68 insertions(+), 76 deletions(-) diff --git a/R/decon.R b/R/decon.R index c6c2ad36..a778820e 100644 --- a/R/decon.R +++ b/R/decon.R @@ -257,60 +257,56 @@ simulateContaminatedMatrix <- function(C = 300, } -#' @title Decontaminate count matrix -#' @description This function updates decontamination on dataset with multiple -#' batches. -#' @param counts Numeric/Integer matrix. Observed count matrix, rows represent +#' @title DecontX +#' @description Identifies contamination in single cell datasets. +#' @param counts Numeric matrix. Observed count matrix, rows represent #' features and columns represent cells. -#' @param z Integer vector. Cell population labels. Default NULL. -#' @param batch Integer vector. Cell batch labels. Default NULL. -#' @param maxIter Integer. Maximum iterations of EM algorithm. Default to be -#' 200. -#' @param delta Numeric. Symmetric concentration parameter for Theta. Default -#' to be 10. +#' @param z Integer vector. Cell cluster labels. If NULL, Celda will be used +#' to reduce the dimensionality of the dataset to 'L' modules, \link[uwot]{umap} +#' will be used to further reduce the dataset to 2 dimenions and +#' \link[dbscan]{dbscan} will be used to identify clusters of broad cell types. +#' Default NULL. +#' @param batch Integer vector. Batch labels for cells. If batch labels +#' are supplied, DecontX is run on cells from each batch separately. +#' Default NULL. +#' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500. +#' @param delta Numeric. Symmetric Dirichlet concentration parameter +#' for Theta. Default 10. #' @param logfile Character. Messages will be redirected to a file named #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. -#' @param varGenes Positive Integer. Used only when z is not provided. -#' Need to be larger than 1. Default value is 5000 if not provided. -#' varGenes, being the number of most variable genes, is used to filter genes -#' based on the variability of gene's expression cross cells. While the -#' variability is calcualted using scran::trendVar() and scran::decomposeVar(). -#' @param L Positive Integer. Used only when z is not provided. -#' Need to be larger than 1. Default value is 50 if not provided. -#' L, being the number of gene modules, is used on celda_CG clustering -#' to collapse genes into gene modules. -#' @param dbscanEps Numeric. Used only when z is not provided. -#' Need to be non-negative. Default is 1.0 if not provided. -#' dbscanEps is the clustering resolution parameter that is used to feed into -#' dbscan::dbscan() to estimate broad cell clusters. +#' @param varGenes Integer. The number of variable genes to use in +#' Celda clustering. Variability is calcualted using \link[scran]{modelGeneVar}. +#' Used only when z is not provided. Default 5000. +#' @param L Integer. Number of modules for Celda clustering. Used to reduce +#' the dimensionality of the dataset before applying UMAP and dbscan. +#' Used only when z is not provided. Default 50. +#' @param dbscanEps Numeric. The clustering resolution parameter +#' used in \link[dbscan]{dbscan} to estimate broad cell clusters. +#' Used only when z is not provided. Default 1. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. #' @return A list object which contains the decontaminated count matrix and #' related parameters. #' @examples -#' data(contaminationSim) -#' deconC <- decontX( -#' counts = contaminationSim$rmat + contaminationSim$cmat, -#' z = contaminationSim$z, maxIter = 3 -#' ) -#' deconBg <- decontX( -#' counts = contaminationSim$rmat + contaminationSim$cmat, -#' maxIter = 3 -#' ) +#' s <- simulateContaminatedMatrix() +#' res <- decontX(s$observedCounts, s$z) +#' contamination <- colSums(s$observedCounts - s$nativeCounts) / +#' colSums(s$observedCounts) +#' plot(contamination, res$resList$estConp) #' @export decontX <- function(counts, z = NULL, batch = NULL, - maxIter = 200, + maxIter = 500, delta = 10, convergence = 0.001, logfile = NULL, verbose = TRUE, - varGenes = NULL, - L = NULL, - dbscanEps = NULL, + varGenes = 5000, + L = 50, + dbscanEps = 1, seed = 12345) { if (is.null(seed)) { @@ -567,7 +563,7 @@ decontX <- function(counts, z <- celda.init$z umap <- celda.init$umap colnames(umap) <- c("DecontX_UMAP_1", - "DecontX_UMAP_2") + "DecontX_UMAP_2") rownames(umap) <- colnames(counts) } @@ -715,10 +711,10 @@ decontX <- function(counts, runParams <- list("deltaInit" = deltaInit, "iteration" = iter - 1L, "z" = z) - if(!is.null(umap)) { + if (!is.null(umap)) { runParams[["UMAP"]] <- umap } - + resList <- list( "logLikelihood" = ll, "estNativeCounts" = nextDecon$estRmat, @@ -823,7 +819,7 @@ addLogLikelihood <- function(llA, llB) { #decomposeTrend <- scran::decomposeVar(sce, mvTrend) #topVariableGenes <- order(decomposeTrend$bio, # decreasing = TRUE)[seq(varGenes)] - + sce.var <- scran::modelGeneVar(sce) topVariableGenes <- order(sce.var$bio, decreasing = TRUE)[seq(varGenes)] diff --git a/man/decontX.Rd b/man/decontX.Rd index e019aacf..0d13b07e 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -2,57 +2,58 @@ % Please edit documentation in R/decon.R \name{decontX} \alias{decontX} -\title{Decontaminate count matrix} +\title{DecontX} \usage{ decontX( counts, z = NULL, batch = NULL, - maxIter = 200, + maxIter = 500, delta = 10, convergence = 0.001, logfile = NULL, verbose = TRUE, - varGenes = NULL, - L = NULL, - dbscanEps = NULL, + varGenes = 5000, + L = 50, + dbscanEps = 1, seed = 12345 ) } \arguments{ -\item{counts}{Numeric/Integer matrix. Observed count matrix, rows represent +\item{counts}{Numeric matrix. Observed count matrix, rows represent features and columns represent cells.} -\item{z}{Integer vector. Cell population labels. Default NULL.} +\item{z}{Integer vector. Cell cluster labels. If NULL, Celda will be used +to reduce the dimensionality of the dataset to 'L' modules, \link[uwot]{umap} +will be used to further reduce the dataset to 2 dimenions and +\link[dbscan]{dbscan} will be used to identify clusters of broad cell types. +Default NULL.} -\item{batch}{Integer vector. Cell batch labels. Default NULL.} +\item{batch}{Integer vector. Batch labels for cells. If batch labels +are supplied, DecontX is run on cells from each batch separately. +Default NULL.} -\item{maxIter}{Integer. Maximum iterations of EM algorithm. Default to be -200.} +\item{maxIter}{Integer. Maximum iterations of the EM algorithm. Default 500.} -\item{delta}{Numeric. Symmetric concentration parameter for Theta. Default -to be 10.} +\item{delta}{Numeric. Symmetric Dirichlet concentration parameter +for Theta. Default 10.} \item{logfile}{Character. Messages will be redirected to a file named `logfile`. If NULL, messages will be printed to stdout. Default NULL.} \item{verbose}{Logical. Whether to print log messages. Default TRUE.} -\item{varGenes}{Positive Integer. Used only when z is not provided. -Need to be larger than 1. Default value is 5000 if not provided. -varGenes, being the number of most variable genes, is used to filter genes -based on the variability of gene's expression cross cells. While the -variability is calcualted using scran::trendVar() and scran::decomposeVar().} +\item{varGenes}{Integer. The number of variable genes to use in +Celda clustering. Variability is calcualted using \link[scran]{modelGeneVar}. +Used only when z is not provided. Default 5000.} -\item{L}{Positive Integer. Used only when z is not provided. -Need to be larger than 1. Default value is 50 if not provided. -L, being the number of gene modules, is used on celda_CG clustering -to collapse genes into gene modules.} +\item{L}{Integer. Number of modules for Celda clustering. Used to reduce +the dimensionality of the dataset before applying UMAP and dbscan. +Used only when z is not provided. Default 50.} -\item{dbscanEps}{Numeric. Used only when z is not provided. -Need to be non-negative. Default is 1.0 if not provided. -dbscanEps is the clustering resolution parameter that is used to feed into -dbscan::dbscan() to estimate broad cell clusters.} +\item{dbscanEps}{Numeric. The clustering resolution parameter +used in \link[dbscan]{dbscan} to estimate broad cell clusters. +Used only when z is not provided. Default 1.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to @@ -63,17 +64,12 @@ A list object which contains the decontaminated count matrix and related parameters. } \description{ -This function updates decontamination on dataset with multiple - batches. +Identifies contamination in single cell datasets. } \examples{ -data(contaminationSim) -deconC <- decontX( - counts = contaminationSim$rmat + contaminationSim$cmat, - z = contaminationSim$z, maxIter = 3 -) -deconBg <- decontX( - counts = contaminationSim$rmat + contaminationSim$cmat, - maxIter = 3 -) + s <- simulateContaminatedMatrix() + res <- decontX(s$observedCounts, s$z) + contamination <- colSums(s$observedCounts - s$nativeCounts) / + colSums(s$observedCounts) + plot(contamination, res$resList$estConp) } From 36d265208c2264586f4591e35baab6b8dadc4d88 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 23:43:32 -0500 Subject: [PATCH 117/149] Changed output names in results list --- R/decon.R | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/R/decon.R b/R/decon.R index a778820e..92e8cfb8 100644 --- a/R/decon.R +++ b/R/decon.R @@ -461,17 +461,13 @@ decontX <- function(counts, ## So update z in the final returned result runParams$z <- returnZ method <- resBat$method - resList <- list( - "logLikelihood" = logLikelihood, - "estNativeCounts" = estRmat, - "estConp" = estConp, - "theta" = theta - ) returnResult <- list( "runParams" = runParams, - "resList" = resList, - "method" = method + "logLikelihood" = logLikelihood, + "decontX_counts" = estRmat, + "contamination" = estConp, + "theta" = theta ) } else { ## When there is only one batch returnResult <- .decontXoneBatch( @@ -490,14 +486,15 @@ decontX <- function(counts, estRmat <- calculateNativeMatrix( counts = counts, native_counts = estRmat, - theta = returnResult$resList$theta, - eta = returnResult$resList$eta, + theta = returnResult$theta, + eta = returnResult$eta, row_index = which(noneEmptyGeneIndex), col_index = seq(totalCells), - phi = returnResult$resList$phi, + phi = returnResult$phi, z = as.integer(returnResult$runParams$z), pseudocount = 1e-20) - returnResult$resList$estNativeCounts <- estRmat + + returnResult$decontX_counts <- estRmat } endTime <- Sys.time() @@ -539,6 +536,7 @@ decontX <- function(counts, nC <- ncol(counts) deconMethod <- "clustering" + ## Generate cell cluster labels if none are provided umap <- NULL if (is.null(z)) { .logMessages( @@ -715,19 +713,14 @@ decontX <- function(counts, runParams[["UMAP"]] <- umap } - resList <- list( + return(list( + "runParams" = runParams, "logLikelihood" = ll, - "estNativeCounts" = nextDecon$estRmat, - "estConp" = resConp, + "contamination" = resConp, "theta" = theta, "delta" = delta, "phi" = phi, "eta" = eta - ) - - return(list( - "runParams" = runParams, - "resList" = resList )) } @@ -891,8 +884,8 @@ addLogLikelihood <- function(llA, llB) { if (is.null(varGenes)) { varGenes <- 5000 } else { - if (varGenes < 2 | !is.integer(varGenes)) { - stop("Parameter 'varGenes' must be an integer and larger than 1.") + if (varGenes < 2 | length(varGenes) > 1) { + stop("Parameter 'varGenes' must be an integer larger than 1.") } } return(varGenes) @@ -915,8 +908,8 @@ addLogLikelihood <- function(llA, llB) { if (is.null(L)) { L <- 50 } else { - if (L < 2 | !is.integer(L)) { - stop("Parameter 'L' must be an integer and larger than 1.") + if (L < 2 | length(L) > 1) { + stop("Parameter 'L' must be an integer larger than 1.") } } return(L) From 4be3411dc86c296b67b85556f51718645b133e5d Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Fri, 3 Jan 2020 23:45:57 -0500 Subject: [PATCH 118/149] Updated DecontX example --- R/decon.R | 4 ++-- man/decontX.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/decon.R b/R/decon.R index 92e8cfb8..1c5d1658 100644 --- a/R/decon.R +++ b/R/decon.R @@ -291,10 +291,10 @@ simulateContaminatedMatrix <- function(C = 300, #' related parameters. #' @examples #' s <- simulateContaminatedMatrix() -#' res <- decontX(s$observedCounts, s$z) +#' result <- decontX(s$observedCounts, s$z) #' contamination <- colSums(s$observedCounts - s$nativeCounts) / #' colSums(s$observedCounts) -#' plot(contamination, res$resList$estConp) +#' plot(contamination, result$contamination) #' @export decontX <- function(counts, z = NULL, diff --git a/man/decontX.Rd b/man/decontX.Rd index 0d13b07e..41a71c57 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -68,8 +68,8 @@ Identifies contamination in single cell datasets. } \examples{ s <- simulateContaminatedMatrix() - res <- decontX(s$observedCounts, s$z) + result <- decontX(s$observedCounts, s$z) contamination <- colSums(s$observedCounts - s$nativeCounts) / colSums(s$observedCounts) - plot(contamination, res$resList$estConp) + plot(contamination, result$contamination) } From 865abd27ec231b1cf34b4f4172c7f6125aab7b6f Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 4 Jan 2020 01:00:10 -0500 Subject: [PATCH 119/149] Updated names of results list in DecontX. Included estimated distributions for each batch. Updated documentation --- R/decon.R | 279 ++++++++++++++++++++++++++++--------------------- man/decontX.Rd | 7 +- 2 files changed, 163 insertions(+), 123 deletions(-) diff --git a/R/decon.R b/R/decon.R index 1c5d1658..f0441998 100644 --- a/R/decon.R +++ b/R/decon.R @@ -272,9 +272,6 @@ simulateContaminatedMatrix <- function(C = 300, #' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500. #' @param delta Numeric. Symmetric Dirichlet concentration parameter #' for Theta. Default 10. -#' @param logfile Character. Messages will be redirected to a file named -#' `logfile`. If NULL, messages will be printed to stdout. Default NULL. -#' @param verbose Logical. Whether to print log messages. Default TRUE. #' @param varGenes Integer. The number of variable genes to use in #' Celda clustering. Variability is calcualted using \link[scran]{modelGeneVar}. #' Used only when z is not provided. Default 5000. @@ -287,8 +284,15 @@ simulateContaminatedMatrix <- function(C = 300, #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @return A list object which contains the decontaminated count matrix and -#' related parameters. +#' @param logfile Character. Messages will be redirected to a file named +#' `logfile`. If NULL, messages will be printed to stdout. Default NULL. +#' @param verbose Logical. Whether to print log messages. Default TRUE. +#' @return List. 'decontX_counts' contains the decontaminated count matrix +#' 'contamination' contains the per-cell contamination estimates. +#' 'batchEstimates' contains the estimated probability distributions +#' for each batch. 'z' contains the cell cluster labels. 'runParams' +#' contains a list of arguments used in this function. +#' #' @examples #' s <- simulateContaminatedMatrix() #' result <- decontX(s$observedCounts, s$z) @@ -366,6 +370,17 @@ decontX <- function(counts, append = TRUE, verbose = verbose) + runParams <- list( + batch = batch, + maxIter = maxIter, + delta = delta, + convergence = convergence, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + logfile = logfile, + verbose = verbose) + # Convert to sparse matrix # After Celda can run on sparse matrix, # then we can just have this be required @@ -386,6 +401,7 @@ decontX <- function(counts, nC <- ncol(counts) allCellNames <- colnames(counts) + ## Set up final deconaminated matrix estRmat <- Matrix::Matrix( data = 0, ncol = totalCells, @@ -394,108 +410,141 @@ decontX <- function(counts, dimnames = list(geneNames, allCellNames) ) - if (!is.null(batch)) { - ## Set result lists upfront for all cells from different batches - logLikelihood <- c() - - theta <- rep(NA, nC) - estConp <- rep(NA, nC) - returnZ <- rep(NA, nC) - - batchIndex <- unique(batch) - - for (bat in batchIndex) { - .logMessages( - date(), - ".. Analyzing cells in batch", - bat, - logfile = logfile, - append = TRUE, - verbose = verbose - ) - - zBat <- NULL - countsBat <- counts[, batch == bat] - if (!is.null(z)) { - zBat <- z[batch == bat] - } - resBat <- .decontXoneBatch( - counts = countsBat, - z = zBat, - batch = bat, - maxIter = maxIter, - delta = delta, - convergence = convergence, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - dbscanEps = dbscanEps, - L = L - ) - - estRmat <- calculateNativeMatrix( - counts = countsBat, - native_counts = estRmat, - theta = resBat$resList$theta, - eta = resBat$resList$eta, - row_index = which(noneEmptyGeneIndex), - col_index = which(batch == bat), - phi = resBat$resList$phi, - z = as.integer(resBat$runParams$z), - pseudocount = 1e-20) - - estConp[batch == bat] <- resBat$resList$estConp - theta[batch == bat] <- resBat$resList$theta - returnZ[batch == bat] <- resBat$runParams$z - - if (is.null(logLikelihood)) { - logLikelihood <- resBat$resList$logLikelihood - } else { - logLikelihood <- addLogLikelihood(logLikelihood, - resBat$resList$logLikelihood) - } - } - - runParams <- resBat$runParams - ## All batches share the same other parameters except cluster label z - ## So update z in the final returned result - runParams$z <- returnZ - method <- resBat$method - - returnResult <- list( - "runParams" = runParams, - "logLikelihood" = logLikelihood, - "decontX_counts" = estRmat, - "contamination" = estConp, - "theta" = theta - ) - } else { ## When there is only one batch - returnResult <- .decontXoneBatch( - counts = counts, - z = z, - maxIter = maxIter, - delta = delta, - convergence = convergence, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - dbscanEps = dbscanEps, - L = L - ) + ## Generate batch labels if none were supplied + if (is.null(batch)) { + batch <- rep("all", nC) + } + batchIndex <- unique(batch) + + ## Set result lists upfront for all cells from different batches + logLikelihood <- c() + estConp <- rep(NA, nC) + returnZ <- rep(NA, nC) + resBatch <- list() + + ## Cycle through each sample/batch and run DecontX + for (bat in batchIndex) { + + if(length(batchIndex) == 1) { + .logMessages( + date(), + ".. Analyzing all cells", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + } else { + .logMessages( + date(), + ".. Analyzing cells in batch", + bat, + logfile = logfile, + append = TRUE, + verbose = verbose + ) + } + + zBat <- NULL + countsBat <- counts[, batch == bat] + if (!is.null(z)) { + zBat <- z[batch == bat] + } + res <- .decontXoneBatch( + counts = countsBat, + z = zBat, + batch = bat, + maxIter = maxIter, + delta = delta, + convergence = convergence, + logfile = logfile, + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps, + L = L + ) + + estRmat <- calculateNativeMatrix( + counts = countsBat, + native_counts = estRmat, + theta = res$theta, + eta = res$eta, + row_index = which(noneEmptyGeneIndex), + col_index = which(batch == bat), + phi = res$phi, + z = as.integer(res$z), + pseudocount = 1e-20 + ) + + resBatch[[bat]] <- list( + z = res$z, + phi = res$phi, + eta = res$eta, + delta = res$delta, + theta = res$theta, + logLikelihood = res$logLikelihood, + UMAP = res$UMAP, + z = res$z, + iteration = res$iteration + ) + + estConp[batch == bat] <- res$contamination + if(length(batchIndex) > 1) { + returnZ[batch == bat] <- paste0(bat, "-", res$z) + } else { + returnZ[batch == bat] <- res$z + } + +# if (is.null(logLikelihood)) { +# logLikelihood <- resBat$resList$logLikelihood +# } else { +# logLikelihood <- addLogLikelihood(logLikelihood, +# resBat$resList$logLikelihood) +# } + } + names(resBatch) <- batchIndex + +# runParams <- res$runParams + ## All batches share the same other parameters except cluster label z + ## So update z in the final returned result +# runParams$z <- returnZ +# method <- res$method + + returnResult <- list( + "runParams" = runParams, + "batchEstimates" = resBatch, + "decontX_counts" = estRmat, + "contamination" = estConp, + "z" = returnZ + ) + + +# } else { ## When there is only one batch +# returnResult <- .decontXoneBatch( +# counts = counts, +# z = z, +# maxIter = maxIter, +# delta = delta, +# convergence = convergence, +# logfile = logfile, +# verbose = verbose, +# varGenes = varGenes, +# dbscanEps = dbscanEps, +# L = L +# ) - estRmat <- calculateNativeMatrix( - counts = counts, - native_counts = estRmat, - theta = returnResult$theta, - eta = returnResult$eta, - row_index = which(noneEmptyGeneIndex), - col_index = seq(totalCells), - phi = returnResult$phi, - z = as.integer(returnResult$runParams$z), - pseudocount = 1e-20) +# estRmat <- calculateNativeMatrix( +# counts = counts, +# native_counts = estRmat, +# theta = returnResult$theta, +# eta = returnResult$eta, +# row_index = which(noneEmptyGeneIndex), +# col_index = seq(totalCells), +# phi = returnResult$phi, +# z = as.integer(returnResult$runParams$z), +# pseudocount = 1e-20) - returnResult$decontX_counts <- estRmat - } +# returnResult$decontX_counts <- estRmat +# } endTime <- Sys.time() .logMessages(paste(rep("-", 50), collapse = ""), @@ -695,32 +744,20 @@ decontX <- function(counts, } } - # resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) resConp <- nextDecon$contamination names(resConp) <- colnames(counts) - if (!is.null(batch)) { - batchMessage <- paste(" ", "in batch ", batch, ".", sep = "") - } else { - batchMessage <- "." - } - - runParams <- list("deltaInit" = deltaInit, - "iteration" = iter - 1L, - "z" = z) - if (!is.null(umap)) { - runParams[["UMAP"]] <- umap - } - return(list( - "runParams" = runParams, "logLikelihood" = ll, "contamination" = resConp, "theta" = theta, "delta" = delta, "phi" = phi, - "eta" = eta + "eta" = eta, + "UMAP" = umap, + "iteration" = iter - 1L, + "z" = z )) } diff --git a/man/decontX.Rd b/man/decontX.Rd index 41a71c57..72e9feeb 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -60,8 +60,11 @@ a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} } \value{ -A list object which contains the decontaminated count matrix and - related parameters. +List. 'decontX_counts' contains the decontaminated count matrix +'contamination' contains the per-cell contamination estimates. +'batchEstimates' contains the estimated probability distributions +for each batch. 'z' contains the cell cluster labels. 'runParams' +contains a list of arguments used in this function. } \description{ Identifies contamination in single cell datasets. From 243c76015c9bf27a34dc13119dbe79f203145404 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 4 Jan 2020 01:38:52 -0500 Subject: [PATCH 120/149] Updated decontX docs. Added parameter to only check for convergence and calculate logLik every 10 steps --- R/decon.R | 88 +++++++++++++++++++++++++++++++------------------- man/decontX.Rd | 31 ++++++++++++------ 2 files changed, 75 insertions(+), 44 deletions(-) diff --git a/R/decon.R b/R/decon.R index f0441998..2e17c987 100644 --- a/R/decon.R +++ b/R/decon.R @@ -258,28 +258,36 @@ simulateContaminatedMatrix <- function(C = 300, #' @title DecontX -#' @description Identifies contamination in single cell datasets. +#' @description Identifies contamination from factors such as ambient RNA +#' in single cell genomic datasets. #' @param counts Numeric matrix. Observed count matrix, rows represent #' features and columns represent cells. #' @param z Integer vector. Cell cluster labels. If NULL, Celda will be used -#' to reduce the dimensionality of the dataset to 'L' modules, \link[uwot]{umap} -#' will be used to further reduce the dataset to 2 dimenions and -#' \link[dbscan]{dbscan} will be used to identify clusters of broad cell types. -#' Default NULL. +#' to reduce the dimensionality of the dataset to 'L' modules, +#' '\link[uwot]{umap}' from the 'uwot' package will be used to further +#' reduce the dataset to 2 dimenions and the '\link[dbscan]{dbscan}' +#' function from the 'dbscan' package will be used to identify clusters +#' of broad cell types. Default NULL. #' @param batch Integer vector. Batch labels for cells. If batch labels #' are supplied, DecontX is run on cells from each batch separately. #' Default NULL. #' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500. +#' @param convergence Numeric. The EM algorithm will be stopped if the maximum +#' difference in the contamination estimates between the previous 'convergenceCheck' +#' is less than this. Default 0.001. +#' @param convergenceCheck Integer. Check for convergence every +#' 'convergenceCheck' iterations. Default 10. #' @param delta Numeric. Symmetric Dirichlet concentration parameter -#' for Theta. Default 10. +#' to initialize theta. Default 10. #' @param varGenes Integer. The number of variable genes to use in -#' Celda clustering. Variability is calcualted using \link[scran]{modelGeneVar}. +#' Celda clustering. Variability is calcualted using '\link[scran]{modelGeneVar}' +#' function from the 'scran' package. #' Used only when z is not provided. Default 5000. #' @param L Integer. Number of modules for Celda clustering. Used to reduce #' the dimensionality of the dataset before applying UMAP and dbscan. #' Used only when z is not provided. Default 50. #' @param dbscanEps Numeric. The clustering resolution parameter -#' used in \link[dbscan]{dbscan} to estimate broad cell clusters. +#' used in '\link[dbscan]{dbscan}' to estimate broad cell clusters. #' Used only when z is not provided. Default 1. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to @@ -287,11 +295,11 @@ simulateContaminatedMatrix <- function(C = 300, #' @param logfile Character. Messages will be redirected to a file named #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. -#' @return List. 'decontX_counts' contains the decontaminated count matrix +#' @return 'decontX_counts' contains the decontaminated count matrix. #' 'contamination' contains the per-cell contamination estimates. #' 'batchEstimates' contains the estimated probability distributions #' for each batch. 'z' contains the cell cluster labels. 'runParams' -#' contains a list of arguments used in this function. +#' contains a list of arguments used in the function call. #' #' @examples #' s <- simulateContaminatedMatrix() @@ -306,6 +314,7 @@ decontX <- function(counts, maxIter = 500, delta = 10, convergence = 0.001, + convergenceStep = 10, logfile = NULL, verbose = TRUE, varGenes = 5000, @@ -320,6 +329,7 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, convergence = convergence, + convergenceStep = convergenceStep, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -333,6 +343,7 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, convergence = convergence, + convergenceStep = convergenceStep, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -350,6 +361,7 @@ decontX <- function(counts, maxIter = 200, delta = 10, convergence = 0.001, + convergenceStep = 10, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -456,6 +468,7 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, convergence = convergence, + convergenceStep = convergenceStep, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -573,6 +586,7 @@ decontX <- function(counts, maxIter = 200, delta = 10, convergence = 0.01, + convergenceStep = 10, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -708,30 +722,34 @@ decontX <- function(counts, # eta = eta, # theta = theta # ) - llTemp <- decontXLogLik( - counts = counts, - z = z, - phi = phi, - eta = eta, - theta = theta, - pseudocount = 1e-20) - - ll <- c(ll, llTemp) - llRound <- c(llRound, round(llTemp, 2)) - - if (round(llTemp, 2) > llRound[iter] | iter == 1) { - numIterWithoutImprovement <- 1L - } else { - numIterWithoutImprovement <- numIterWithoutImprovement + 1L - } - max.divergence <- max(abs(theta.previous - theta)) - if (max.divergence < convergence) { - converged <- TRUE - } - theta.previous <- theta - - .logMessages(date(), + ## Calculate likelihood and check for convergence + if (iter %% convergenceStep == 0) { + + llTemp <- decontXLogLik( + counts = counts, + z = z, + phi = phi, + eta = eta, + theta = theta, + pseudocount = 1e-20) + + ll <- c(ll, llTemp) +# llRound <- c(llRound, round(llTemp, 2)) + +# if (round(llTemp, 2) > llRound[iter] | iter == 1) { +# numIterWithoutImprovement <- 1L +# } else { +# numIterWithoutImprovement <- numIterWithoutImprovement + 1L +# } + + max.divergence <- max(abs(theta.previous - theta)) + if (max.divergence < convergence) { + converged <- TRUE + } + theta.previous <- theta + + .logMessages(date(), ".... Completed iteration:", iter, "| converge:", @@ -739,8 +757,10 @@ decontX <- function(counts, logfile = logfile, append = TRUE, verbose = verbose) - + } + iter <- iter + 1L + } } diff --git a/man/decontX.Rd b/man/decontX.Rd index 72e9feeb..cf42797b 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -11,6 +11,7 @@ decontX( maxIter = 500, delta = 10, convergence = 0.001, + convergenceStep = 10, logfile = NULL, verbose = TRUE, varGenes = 5000, @@ -24,10 +25,11 @@ decontX( features and columns represent cells.} \item{z}{Integer vector. Cell cluster labels. If NULL, Celda will be used -to reduce the dimensionality of the dataset to 'L' modules, \link[uwot]{umap} -will be used to further reduce the dataset to 2 dimenions and -\link[dbscan]{dbscan} will be used to identify clusters of broad cell types. -Default NULL.} +to reduce the dimensionality of the dataset to 'L' modules, +'\link[uwot]{umap}' from the 'uwot' package will be used to further +reduce the dataset to 2 dimenions and the '\link[dbscan]{dbscan}' +function from the 'dbscan' package will be used to identify clusters +of broad cell types. Default NULL.} \item{batch}{Integer vector. Batch labels for cells. If batch labels are supplied, DecontX is run on cells from each batch separately. @@ -36,7 +38,11 @@ Default NULL.} \item{maxIter}{Integer. Maximum iterations of the EM algorithm. Default 500.} \item{delta}{Numeric. Symmetric Dirichlet concentration parameter -for Theta. Default 10.} +to initialize theta. Default 10.} + +\item{convergence}{Numeric. The EM algorithm will be stopped if the maximum +difference in the contamination estimates between the previous 'convergenceCheck' +is less than this. Default 0.001.} \item{logfile}{Character. Messages will be redirected to a file named `logfile`. If NULL, messages will be printed to stdout. Default NULL.} @@ -44,7 +50,8 @@ for Theta. Default 10.} \item{verbose}{Logical. Whether to print log messages. Default TRUE.} \item{varGenes}{Integer. The number of variable genes to use in -Celda clustering. Variability is calcualted using \link[scran]{modelGeneVar}. +Celda clustering. Variability is calcualted using '\link[scran]{modelGeneVar}' +function from the 'scran' package. Used only when z is not provided. Default 5000.} \item{L}{Integer. Number of modules for Celda clustering. Used to reduce @@ -52,22 +59,26 @@ the dimensionality of the dataset before applying UMAP and dbscan. Used only when z is not provided. Default 50.} \item{dbscanEps}{Numeric. The clustering resolution parameter -used in \link[dbscan]{dbscan} to estimate broad cell clusters. +used in '\link[dbscan]{dbscan}' to estimate broad cell clusters. Used only when z is not provided. Default 1.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} + +\item{convergenceCheck}{Integer. Check for convergence every +'convergenceCheck' iterations. Default 10.} } \value{ -List. 'decontX_counts' contains the decontaminated count matrix +'decontX_counts' contains the decontaminated count matrix. 'contamination' contains the per-cell contamination estimates. 'batchEstimates' contains the estimated probability distributions for each batch. 'z' contains the cell cluster labels. 'runParams' -contains a list of arguments used in this function. +contains a list of arguments used in the function call. } \description{ -Identifies contamination in single cell datasets. +Identifies contamination from factors such as ambient RNA +in single cell genomic datasets. } \examples{ s <- simulateContaminatedMatrix() From 4f2a87f48fff60005e307994fef4cd4eefe472c6 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 4 Jan 2020 22:21:00 -0500 Subject: [PATCH 121/149] Changed covergence check back to every interation but still calculate the log likelihood every 10 iterations. Added message for conversion to sparse matrix --- R/decon.R | 36 +++++++++++++++++++++--------------- man/decontX.Rd | 4 ++-- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/R/decon.R b/R/decon.R index 2e17c987..eb13b0ef 100644 --- a/R/decon.R +++ b/R/decon.R @@ -275,8 +275,8 @@ simulateContaminatedMatrix <- function(C = 300, #' @param convergence Numeric. The EM algorithm will be stopped if the maximum #' difference in the contamination estimates between the previous 'convergenceCheck' #' is less than this. Default 0.001. -#' @param convergenceCheck Integer. Check for convergence every -#' 'convergenceCheck' iterations. Default 10. +#' @param iterLogLik Integer. Calculate log likelihood every 'iterLogLik' +#' iteration. Default 10. #' @param delta Numeric. Symmetric Dirichlet concentration parameter #' to initialize theta. Default 10. #' @param varGenes Integer. The number of variable genes to use in @@ -393,12 +393,18 @@ decontX <- function(counts, logfile = logfile, verbose = verbose) - # Convert to sparse matrix - # After Celda can run on sparse matrix, - # then we can just have this be required - # as input - counts <- as(counts, "dgCMatrix") - + ## Convert to sparse matrix + if (class(counts) != "dgCMatrix") { + .logMessages( + date(), + ".. Converting to sparse matrix", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + counts <- as(counts, "dgCMatrix") + } + ## Empty expression genes won't be used for estimation haveEmptyGenes <- FALSE totalGenes <- nrow(counts) @@ -723,8 +729,14 @@ decontX <- function(counts, # theta = theta # ) + max.divergence <- max(abs(theta.previous - theta)) + if (max.divergence < convergence) { + converged <- TRUE + } + theta.previous <- theta + ## Calculate likelihood and check for convergence - if (iter %% convergenceStep == 0) { + if (iter %% convergenceStep == 0 || converged) { llTemp <- decontXLogLik( counts = counts, @@ -743,12 +755,6 @@ decontX <- function(counts, # numIterWithoutImprovement <- numIterWithoutImprovement + 1L # } - max.divergence <- max(abs(theta.previous - theta)) - if (max.divergence < convergence) { - converged <- TRUE - } - theta.previous <- theta - .logMessages(date(), ".... Completed iteration:", iter, diff --git a/man/decontX.Rd b/man/decontX.Rd index cf42797b..c11bff64 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -66,8 +66,8 @@ Used only when z is not provided. Default 1.} a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} -\item{convergenceCheck}{Integer. Check for convergence every -'convergenceCheck' iterations. Default 10.} +\item{iterLogLik}{Integer. Every iteration to calculate log likelihood +Default 10.} } \value{ 'decontX_counts' contains the decontaminated count matrix. From a2c0f1c14f92718803929e26e2b183cfe8f0f58b Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 4 Jan 2020 22:30:51 -0500 Subject: [PATCH 122/149] Updated name of batch when no 'batch' variable is supplied --- R/decon.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/decon.R b/R/decon.R index eb13b0ef..9f6a5384 100644 --- a/R/decon.R +++ b/R/decon.R @@ -357,7 +357,6 @@ decontX <- function(counts, .decontX <- function(counts, z = NULL, - batch = NULL, maxIter = 200, delta = 10, convergence = 0.001, @@ -394,7 +393,7 @@ decontX <- function(counts, verbose = verbose) ## Convert to sparse matrix - if (class(counts) != "dgCMatrix") { + if (!inherits(counts, "dgCMatrix")) { .logMessages( date(), ".. Converting to sparse matrix", @@ -430,8 +429,9 @@ decontX <- function(counts, ## Generate batch labels if none were supplied if (is.null(batch)) { - batch <- rep("all", nC) + batch <- rep(1, nC) } + runParams$batch <- batch batchIndex <- unique(batch) ## Set result lists upfront for all cells from different batches From 12cc55dfde6a7c71af1c27204f7b857cb7494a59 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 4 Jan 2020 22:35:10 -0500 Subject: [PATCH 123/149] Added back missing batch argument --- R/decon.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/decon.R b/R/decon.R index 9f6a5384..008d7cac 100644 --- a/R/decon.R +++ b/R/decon.R @@ -357,6 +357,7 @@ decontX <- function(counts, .decontX <- function(counts, z = NULL, + batch = NULL, maxIter = 200, delta = 10, convergence = 0.001, From 586598048204345cbf54b1de23ecbbf0c07d21cb Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 4 Jan 2020 22:53:22 -0500 Subject: [PATCH 124/149] Removed code that filters out genes with all zeros as it is not nessissary now that we are using sparse matrices --- R/decon.R | 10 +--------- src/DecontX.cpp | 4 ++-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/R/decon.R b/R/decon.R index 008d7cac..4c28460d 100644 --- a/R/decon.R +++ b/R/decon.R @@ -405,17 +405,9 @@ decontX <- function(counts, counts <- as(counts, "dgCMatrix") } - ## Empty expression genes won't be used for estimation - haveEmptyGenes <- FALSE totalGenes <- nrow(counts) totalCells <- ncol(counts) - noneEmptyGeneIndex <- Matrix::rowSums(counts) != 0 geneNames <- rownames(counts) - if (sum(noneEmptyGeneIndex) != totalGenes) { - counts <- counts[noneEmptyGeneIndex, ] - haveEmptyGenes <- TRUE - } - nC <- ncol(counts) allCellNames <- colnames(counts) @@ -488,7 +480,7 @@ decontX <- function(counts, native_counts = estRmat, theta = res$theta, eta = res$eta, - row_index = which(noneEmptyGeneIndex), + row_index = seq(nrow(counts)), col_index = which(batch == bat), phi = res$phi, z = as.integer(res$z), diff --git a/src/DecontX.cpp b/src/DecontX.cpp index c6f9cebe..c339c23a 100644 --- a/src/DecontX.cpp +++ b/src/DecontX.cpp @@ -42,8 +42,8 @@ Rcpp::List decontXEM(const arma::sp_mat& counts, NumericMatrix new_phi(phi.nrow(), phi.ncol()); NumericMatrix new_eta(eta.nrow(), eta.ncol()); - std::fill(new_phi.begin(), new_phi.end(), pseudocount); - std::fill(new_eta.begin(), new_eta.end(), pseudocount); +// std::fill(new_phi.begin(), new_phi.end(), pseudocount); +// std::fill(new_eta.begin(), new_eta.end(), pseudocount); // Obtaining 'fit_dirichlet' function from MCMCprecision package Environment pkg = Environment::namespace_env("MCMCprecision"); From daec32e1d9ef9f76c5d6a94b4a6d30a138e4743a Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 4 Jan 2020 23:13:14 -0500 Subject: [PATCH 125/149] finished renaming 'convergenceStep' to 'iterLogLik' --- R/decon.R | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/R/decon.R b/R/decon.R index 4c28460d..c4d2fb81 100644 --- a/R/decon.R +++ b/R/decon.R @@ -314,7 +314,7 @@ decontX <- function(counts, maxIter = 500, delta = 10, convergence = 0.001, - convergenceStep = 10, + iterLogLik = 10, logfile = NULL, verbose = TRUE, varGenes = 5000, @@ -329,21 +329,21 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, convergence = convergence, - convergenceStep = convergenceStep, + iterLogLik = iterLogLik, logfile = logfile, verbose = verbose, varGenes = varGenes, L = L, dbscanEps = dbscanEps) } else { - with_seed(seed, + withr::with_seed(seed, res <- .decontX(counts = counts, z = z, batch = batch, maxIter = maxIter, delta = delta, convergence = convergence, - convergenceStep = convergenceStep, + iterLogLik = iterLogLik, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -361,7 +361,7 @@ decontX <- function(counts, maxIter = 200, delta = 10, convergence = 0.001, - convergenceStep = 10, + iterLogLik = 10, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -467,7 +467,7 @@ decontX <- function(counts, maxIter = maxIter, delta = delta, convergence = convergence, - convergenceStep = convergenceStep, + iterLogLik = iterLogLik, logfile = logfile, verbose = verbose, varGenes = varGenes, @@ -585,7 +585,7 @@ decontX <- function(counts, maxIter = 200, delta = 10, convergence = 0.01, - convergenceStep = 10, + iterLogLik = 10, logfile = NULL, verbose = TRUE, varGenes = NULL, @@ -649,6 +649,7 @@ decontX <- function(counts, theta <- stats::rbeta(n = nC, shape1 = deltaInit, shape2 = deltaInit) + nextDecon <- decontXInitialize( counts = counts, @@ -729,7 +730,7 @@ decontX <- function(counts, theta.previous <- theta ## Calculate likelihood and check for convergence - if (iter %% convergenceStep == 0 || converged) { + if (iter %% iterLogLik == 0 || converged) { llTemp <- decontXLogLik( counts = counts, From af08451267e92d42c6fe2af26e99d95ab5e25756 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sat, 4 Jan 2020 23:28:17 -0500 Subject: [PATCH 126/149] Moved with_seed function to wrap around .decontXonebatch so that way different samples/batches will be reprodicible whether or not they are run together or alone --- R/decon.R | 96 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/R/decon.R b/R/decon.R index c4d2fb81..150a71d2 100644 --- a/R/decon.R +++ b/R/decon.R @@ -322,36 +322,21 @@ decontX <- function(counts, dbscanEps = 1, seed = 12345) { - if (is.null(seed)) { - res <- .decontX(counts = counts, - z = z, - batch = batch, - maxIter = maxIter, - delta = delta, - convergence = convergence, - iterLogLik = iterLogLik, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - L = L, - dbscanEps = dbscanEps) - } else { - withr::with_seed(seed, - res <- .decontX(counts = counts, - z = z, - batch = batch, - maxIter = maxIter, - delta = delta, - convergence = convergence, - iterLogLik = iterLogLik, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - L = L, - dbscanEps = dbscanEps)) - } - - return(res) + res <- .decontX(counts = counts, + z = z, + batch = batch, + maxIter = maxIter, + delta = delta, + convergence = convergence, + iterLogLik = iterLogLik, + logfile = logfile, + verbose = verbose, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + seed = seed) + + return(res) } @@ -366,7 +351,8 @@ decontX <- function(counts, verbose = TRUE, varGenes = NULL, dbscanEps = NULL, - L = NULL) { + L = NULL, + seed = 12345) { startTime <- Sys.time() .logMessages(paste(rep("-", 50), collapse = ""), @@ -422,7 +408,7 @@ decontX <- function(counts, ## Generate batch labels if none were supplied if (is.null(batch)) { - batch <- rep(1, nC) + batch <- rep("batch", nC) } runParams$batch <- batch batchIndex <- unique(batch) @@ -460,21 +446,37 @@ decontX <- function(counts, if (!is.null(z)) { zBat <- z[batch == bat] } - res <- .decontXoneBatch( - counts = countsBat, - z = zBat, - batch = bat, - maxIter = maxIter, - delta = delta, - convergence = convergence, - iterLogLik = iterLogLik, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - dbscanEps = dbscanEps, - L = L - ) - + if (is.null(seed)) { + res <- .decontXoneBatch( + counts = countsBat, + z = zBat, + batch = bat, + maxIter = maxIter, + delta = delta, + convergence = convergence, + iterLogLik = iterLogLik, + logfile = logfile, + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps, + L = L) + } else { + withr::with_seed(seed, + res <- .decontXoneBatch( + counts = countsBat, + z = zBat, + batch = bat, + maxIter = maxIter, + delta = delta, + convergence = convergence, + iterLogLik = iterLogLik, + logfile = logfile, + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps, + L = L) + ) + } estRmat <- calculateNativeMatrix( counts = countsBat, native_counts = estRmat, From cebb705762f8d2b52e9b574d81b853f6f278d9f6 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Sun, 5 Jan 2020 10:10:07 -0500 Subject: [PATCH 127/149] fix lints --- R/decon.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/decon.R b/R/decon.R index 150a71d2..fa1645ac 100644 --- a/R/decon.R +++ b/R/decon.R @@ -268,7 +268,7 @@ simulateContaminatedMatrix <- function(C = 300, #' reduce the dataset to 2 dimenions and the '\link[dbscan]{dbscan}' #' function from the 'dbscan' package will be used to identify clusters #' of broad cell types. Default NULL. -#' @param batch Integer vector. Batch labels for cells. If batch labels +#' @param batch Integer vector. Batch labels for cells. If batch labels #' are supplied, DecontX is run on cells from each batch separately. #' Default NULL. #' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500. @@ -281,7 +281,7 @@ simulateContaminatedMatrix <- function(C = 300, #' to initialize theta. Default 10. #' @param varGenes Integer. The number of variable genes to use in #' Celda clustering. Variability is calcualted using '\link[scran]{modelGeneVar}' -#' function from the 'scran' package. +#' function from the 'scran' package. #' Used only when z is not provided. Default 5000. #' @param L Integer. Number of modules for Celda clustering. Used to reduce #' the dimensionality of the dataset before applying UMAP and dbscan. @@ -300,7 +300,7 @@ simulateContaminatedMatrix <- function(C = 300, #' 'batchEstimates' contains the estimated probability distributions #' for each batch. 'z' contains the cell cluster labels. 'runParams' #' contains a list of arguments used in the function call. -#' +#' #' @examples #' s <- simulateContaminatedMatrix() #' result <- decontX(s$observedCounts, s$z) @@ -387,10 +387,10 @@ decontX <- function(counts, logfile = logfile, append = TRUE, verbose = verbose - ) - counts <- as(counts, "dgCMatrix") + ) + counts <- as(counts, "dgCMatrix") } - + totalGenes <- nrow(counts) totalCells <- ncol(counts) geneNames <- rownames(counts) @@ -418,10 +418,10 @@ decontX <- function(counts, estConp <- rep(NA, nC) returnZ <- rep(NA, nC) resBatch <- list() - + ## Cycle through each sample/batch and run DecontX for (bat in batchIndex) { - + if(length(batchIndex) == 1) { .logMessages( date(), From 64f3f47fc9a3e9e596304a236f7b5b7468320a6a Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Tue, 7 Jan 2020 14:12:15 -0500 Subject: [PATCH 128/149] Changed to S4 methods for calling DecontX. Added support for calling from SCE --- R/decon.R | 710 +++++++++++++++++++++++++++++------------------------- 1 file changed, 385 insertions(+), 325 deletions(-) diff --git a/R/decon.R b/R/decon.R index fa1645ac..c3e8ef75 100644 --- a/R/decon.R +++ b/R/decon.R @@ -1,262 +1,3 @@ - - -#' @title Simulate contaminated count matrix -#' @description This function generates a list containing two count matrices -- -#' one for real expression, the other one for contamination, as well as other -#' parameters used in the simulation which can be useful for running -#' decontamination. -#' @param C Integer. Number of cells to be simulated. Default to be 300. -#' @param G Integer. Number of genes to be simulated. Default to be 100. -#' @param K Integer. Number of cell populations to be simulated. Default to be -#' 3. -#' @param NRange Integer vector. A vector of length 2 that specifies the lower -#' and upper bounds of the number of counts generated for each cell. Default to -#' be c(500, 1000). -#' @param beta Numeric. Concentration parameter for Phi. Default to be 0.5. -#' @param delta Numeric or Numeric vector. Concentration parameter for Theta. If -#' input as a single numeric value, symmetric values for beta distribution are -#' specified; if input as a vector of lenght 2, the two values will be the -#' shape1 and shape2 paramters of the beta distribution respectively. -#' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, -#' a default value of 12345 is used. If NULL, no calls to -#' \link[withr]{with_seed} are made. -#' @return A list object containing the real expression matrix and contamination -#' expression matrix as well as other parameters used in the simulation. -#' @examples -#' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = c(1, 9)) -#' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = 1) -#' @export -simulateContaminatedMatrix <- function(C = 300, - G = 100, - K = 3, - NRange = c(500, 1000), - beta = 0.5, - delta = c(1, 2), - seed = 12345) { - - if (is.null(seed)) { - res <- .simulateContaminatedMatrix(C = C, - G = G, - K = K, - NRange = NRange, - beta = beta, - delta = delta) - } else { - with_seed(seed, - res <- .simulateContaminatedMatrix(C = C, - G = G, - K = K, - NRange = NRange, - beta = beta, - delta = delta)) - } - - return(res) -} - - -.simulateContaminatedMatrix <- function(C = 300, - G = 100, - K = 3, - NRange = c(500, 1000), - beta = 0.5, - delta = c(1, 2)) { - if (length(delta) == 1) { - cpByC <- stats::rbeta(n = C, - shape1 = delta, - shape2 = delta) - } else { - cpByC <- stats::rbeta(n = C, - shape1 = delta[1], - shape2 = delta[2]) - } - - z <- sample(seq(K), size = C, replace = TRUE) - if (length(unique(z)) < K) { - warning( - "Only ", - length(unique(z)), - " clusters are simulated. Try to increase numebr of cells 'C' if", - " more clusters are needed" - ) - K <- length(unique(z)) - z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) - } - - NbyC <- sample(seq(min(NRange), max(NRange)), - size = C, - replace = TRUE) - cNbyC <- vapply(seq(C), function(i) { - stats::rbinom(n = 1, - size = NbyC[i], - p = cpByC[i]) - }, integer(1)) - rNbyC <- NbyC - cNbyC - - phi <- .rdirichlet(K, rep(beta, G)) - - ## sample real expressed count matrix - cellRmat <- vapply(seq(C), function(i) { - stats::rmultinom(1, size = rNbyC[i], prob = phi[z[i], ]) - }, integer(G)) - - rownames(cellRmat) <- paste0("Gene_", seq(G)) - colnames(cellRmat) <- paste0("Cell_", seq(C)) - - ## sample contamination count matrix - nGByK <- - rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K) - eta <- normalizeCounts(counts = nGByK, normalize = "proportion") - - cellCmat <- vapply(seq(C), function(i) { - stats::rmultinom(1, size = cNbyC[i], prob = eta[, z[i]]) - }, integer(G)) - cellOmat <- cellRmat + cellCmat - - rownames(cellOmat) <- paste0("Gene_", seq(G)) - colnames(cellOmat) <- paste0("Cell_", seq(C)) - - return( - list( - "nativeCounts" = cellRmat, - "observedCounts" = cellOmat, - "NByC" = NbyC, - "z" = z, - "eta" = eta, - "phi" = t(phi) - ) - ) -} - - -# This function calculates the log-likelihood -# -# counts Numeric/Integer matrix. Observed count matrix, rows represent features -# and columns represent cells -# z Integer vector. Cell population labels -# phi Numeric matrix. Rows represent features and columns represent cell -# populations -# eta Numeric matrix. Rows represent features and columns represent cell -# populations -# theta Numeric vector. Proportion of truely expressed transcripts -.deconCalcLL <- function(counts, z, phi, eta, theta) { - # ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + - # 1e-20 ) ) # when dist_mat are K x G matrices - ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + - (1 - theta) * t(eta)[z, ] + 1e-20)) - return(ll) -} - -# DEPRECATED. This is not used, but is kept as it might be useful in the future. -# This function calculates the log-likelihood of background distribution -# decontamination -# bgDist Numeric matrix. Rows represent feature and columns are the times that -# the background-distribution has been replicated. -.bgCalcLL <- function(counts, globalZ, cbZ, phi, eta, theta) { - # ll <- sum(t(counts) * log(theta * t(cellDist) + - # (1 - theta) * t(bgDist) + 1e-20)) - ll <- sum(t(counts) * log(theta * t(phi)[cbZ, ] + - (1 - theta) * t(eta)[globalZ, ] + 1e-20)) - return(ll) -} - - -# This function updates decontamination -# phi Numeric matrix. Rows represent features and columns represent cell -# populations -# eta Numeric matrix. Rows represent features and columns represent cell -# populations -# theta Numeric vector. Proportion of truely expressed transctripts -#' @importFrom MCMCprecision fit_dirichlet -.cDCalcEMDecontamination <- function(counts, - phi, - eta, - theta, - z, - K, - delta) { - ## Notes: use fix-point iteration to update prior for theta, no need - ## to feed delta anymore - - logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20) - logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20) - Pr.e <- exp(logPr) - Pc.e <- exp(logPc) - Pr <- Pr.e / (Pr.e + Pc.e) - - estRmat <- t(Pr) * counts - rnGByK <- .colSumByGroupNumeric(estRmat, z, K) - cnGByK <- rowSums(rnGByK) - rnGByK - - counts.cs <- colSums(counts) - estRmat.cs <- colSums(estRmat) - estRmat.cs.n <- estRmat.cs / counts.cs - estCmat.cs.n <- 1 - estRmat.cs.n - temp <- cbind(estRmat.cs.n, estCmat.cs.n) - deltaV2 <- MCMCprecision::fit_dirichlet(temp)$alpha - - ## Update parameters - theta <- - (estRmat.cs + deltaV2[1]) / (counts.cs + sum(deltaV2)) - phi <- normalizeCounts(rnGByK, - normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- normalizeCounts(cnGByK, - normalize = "proportion", - pseudocountNormalize = 1e-20) - - return(list( - "estRmat" = estRmat, - "theta" = theta, - "phi" = phi, - "eta" = eta, - "delta" = deltaV2 - )) -} - -# DEPRECATED. This is not used, but is kept as it might be useful in the -# feature. -# This function updates decontamination using background distribution -.cDCalcEMbgDecontamination <- - function(counts, globalZ, cbZ, trZ, phi, eta, theta) { - logPr <- log(t(phi)[cbZ, ] + 1e-20) + log(theta + 1e-20) - logPc <- - log(t(eta)[globalZ, ] + 1e-20) + log(1 - theta + 1e-20) - - Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) - Pc <- 1 - Pr - deltaV2 <- - MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha - - estRmat <- t(Pr) * counts - phiUnnormalized <- - .colSumByGroupNumeric(estRmat, cbZ, max(cbZ)) - etaUnnormalized <- - rowSums(phiUnnormalized) - .colSumByGroupNumeric(phiUnnormalized, - trZ, max(trZ)) - - ## Update paramters - theta <- - (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) - phi <- - normalizeCounts(phiUnnormalized, - normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- - normalizeCounts(etaUnnormalized, - normalize = "proportion", - pseudocountNormalize = 1e-20) - - return(list( - "estRmat" = estRmat, - "theta" = theta, - "phi" = phi, - "eta" = eta, - "delta" = deltaV2 - )) - } - - #' @title DecontX #' @description Identifies contamination from factors such as ambient RNA #' in single cell genomic datasets. @@ -307,38 +48,88 @@ simulateContaminatedMatrix <- function(C = 300, #' contamination <- colSums(s$observedCounts - s$nativeCounts) / #' colSums(s$observedCounts) #' plot(contamination, result$contamination) +NULL + #' @export -decontX <- function(counts, - z = NULL, - batch = NULL, - maxIter = 500, - delta = 10, - convergence = 0.001, - iterLogLik = 10, - logfile = NULL, - verbose = TRUE, - varGenes = 5000, - L = 50, - dbscanEps = 1, - seed = 12345) { +setGeneric("decontX", function(x, ...) standardGeneric("decontX")) - res <- .decontX(counts = counts, - z = z, - batch = batch, - maxIter = maxIter, - delta = delta, - convergence = convergence, - iterLogLik = iterLogLik, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - L = L, - dbscanEps = dbscanEps, - seed = seed) - - return(res) + +######################### +# Setting up S4 methods # +######################### + +#' @export +#' @rdname decontX +setMethod("decontX", "ANY", function(x, ...) { + .decontX(counts=x, ...) +}) + +#' @export +#' @importFrom SummarizedExperiment assay +#' @rdname decontX +setMethod("decontX", "SingleCellExperiment", function(x, ..., assayName="counts") +{ + mat <- SummarizedExperiment::assay(x, i=assayName) + result <- .decontX(mat, ...) + + ## Add results into column annotation + colData(x) = cbind(colData(x), + celda_decontX_Contamination = result$contamination, + celda_decontX_Clusters = result$z) + + ## Add new matrix into assay slot wiht same class as original counts + if(class(mat) == "DelayedMatrix") { + decontXcounts(x) <- DelayedArray(result$decontX_counts) + } else { + SummarizedExperiment::assay(x, "decontXcounts") <- + as(result$decontX_counts, class(mat)) + } + + ## Save the rest of the result object into metadata + result$decontX_counts <- NULL + metadata(x)$decontX <- result + + x +}) + + +## Copied from SingleCellExperiment Package + +GET_FUN <- function(exprs_values, ...) { + (exprs_values) # To ensure evaluation + function(object, ...) { + assay(object, i=exprs_values, ...) + } +} + +SET_FUN <- function(exprs_values, ...) { + (exprs_values) # To ensure evaluation + function(object, ..., value) { + assay(object, i=exprs_values, ...) <- value + object + } } +#' @export +setGeneric("decontXcounts", function(object, ...) standardGeneric("decontXcounts")) + +#' @export +setGeneric("decontXcounts<-", function(object, ..., value) standardGeneric("decontXcounts<-")) + +#' @export +setMethod("decontXcounts", "SingleCellExperiment", GET_FUN("decontXcounts")) + +#' @export +setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("decontXcounts")) + + + + + + +########################## +# Core Decontx Functions # +########################## .decontX <- function(counts, z = NULL, @@ -734,55 +525,189 @@ decontX <- function(counts, ## Calculate likelihood and check for convergence if (iter %% iterLogLik == 0 || converged) { - llTemp <- decontXLogLik( - counts = counts, - z = z, - phi = phi, - eta = eta, - theta = theta, - pseudocount = 1e-20) + llTemp <- decontXLogLik( + counts = counts, + z = z, + phi = phi, + eta = eta, + theta = theta, + pseudocount = 1e-20) + + ll <- c(ll, llTemp) +# llRound <- c(llRound, round(llTemp, 2)) + +# if (round(llTemp, 2) > llRound[iter] | iter == 1) { +# numIterWithoutImprovement <- 1L +# } else { +# numIterWithoutImprovement <- numIterWithoutImprovement + 1L +# } + + .logMessages(date(), + ".... Completed iteration:", + iter, + "| converge:", + signif(max.divergence, 4), + logfile = logfile, + append = TRUE, + verbose = verbose) + } + + iter <- iter + 1L + + } + } + +# resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) + resConp <- nextDecon$contamination + names(resConp) <- colnames(counts) + + return(list( + "logLikelihood" = ll, + "contamination" = resConp, + "theta" = theta, + "delta" = delta, + "phi" = phi, + "eta" = eta, + "UMAP" = umap, + "iteration" = iter - 1L, + "z" = z + )) +} + + + + + +# This function calculates the log-likelihood +# +# counts Numeric/Integer matrix. Observed count matrix, rows represent features +# and columns represent cells +# z Integer vector. Cell population labels +# phi Numeric matrix. Rows represent features and columns represent cell +# populations +# eta Numeric matrix. Rows represent features and columns represent cell +# populations +# theta Numeric vector. Proportion of truely expressed transcripts +.deconCalcLL <- function(counts, z, phi, eta, theta) { + # ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + + # 1e-20 ) ) # when dist_mat are K x G matrices + ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + + (1 - theta) * t(eta)[z, ] + 1e-20)) + return(ll) +} + +# DEPRECATED. This is not used, but is kept as it might be useful in the future. +# This function calculates the log-likelihood of background distribution +# decontamination +# bgDist Numeric matrix. Rows represent feature and columns are the times that +# the background-distribution has been replicated. +.bgCalcLL <- function(counts, globalZ, cbZ, phi, eta, theta) { + # ll <- sum(t(counts) * log(theta * t(cellDist) + + # (1 - theta) * t(bgDist) + 1e-20)) + ll <- sum(t(counts) * log(theta * t(phi)[cbZ, ] + + (1 - theta) * t(eta)[globalZ, ] + 1e-20)) + return(ll) +} + + +# This function updates decontamination +# phi Numeric matrix. Rows represent features and columns represent cell +# populations +# eta Numeric matrix. Rows represent features and columns represent cell +# populations +# theta Numeric vector. Proportion of truely expressed transctripts +#' @importFrom MCMCprecision fit_dirichlet +.cDCalcEMDecontamination <- function(counts, + phi, + eta, + theta, + z, + K, + delta) { + ## Notes: use fix-point iteration to update prior for theta, no need + ## to feed delta anymore + + logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20) + logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20) + Pr.e <- exp(logPr) + Pc.e <- exp(logPc) + Pr <- Pr.e / (Pr.e + Pc.e) - ll <- c(ll, llTemp) -# llRound <- c(llRound, round(llTemp, 2)) + estRmat <- t(Pr) * counts + rnGByK <- .colSumByGroupNumeric(estRmat, z, K) + cnGByK <- rowSums(rnGByK) - rnGByK -# if (round(llTemp, 2) > llRound[iter] | iter == 1) { -# numIterWithoutImprovement <- 1L -# } else { -# numIterWithoutImprovement <- numIterWithoutImprovement + 1L -# } - - .logMessages(date(), - ".... Completed iteration:", - iter, - "| converge:", - signif(max.divergence, 4), - logfile = logfile, - append = TRUE, - verbose = verbose) - } - - iter <- iter + 1L - - } - } + counts.cs <- colSums(counts) + estRmat.cs <- colSums(estRmat) + estRmat.cs.n <- estRmat.cs / counts.cs + estCmat.cs.n <- 1 - estRmat.cs.n + temp <- cbind(estRmat.cs.n, estCmat.cs.n) + deltaV2 <- MCMCprecision::fit_dirichlet(temp)$alpha -# resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) - resConp <- nextDecon$contamination - names(resConp) <- colnames(counts) + ## Update parameters + theta <- + (estRmat.cs + deltaV2[1]) / (counts.cs + sum(deltaV2)) + phi <- normalizeCounts(rnGByK, + normalize = "proportion", + pseudocountNormalize = 1e-20) + eta <- normalizeCounts(cnGByK, + normalize = "proportion", + pseudocountNormalize = 1e-20) return(list( - "logLikelihood" = ll, - "contamination" = resConp, + "estRmat" = estRmat, "theta" = theta, - "delta" = delta, "phi" = phi, "eta" = eta, - "UMAP" = umap, - "iteration" = iter - 1L, - "z" = z + "delta" = deltaV2 )) } +# DEPRECATED. This is not used, but is kept as it might be useful in the +# feature. +# This function updates decontamination using background distribution +.cDCalcEMbgDecontamination <- + function(counts, globalZ, cbZ, trZ, phi, eta, theta) { + logPr <- log(t(phi)[cbZ, ] + 1e-20) + log(theta + 1e-20) + logPc <- + log(t(eta)[globalZ, ] + 1e-20) + log(1 - theta + 1e-20) + + Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) + Pc <- 1 - Pr + deltaV2 <- + MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha + + estRmat <- t(Pr) * counts + phiUnnormalized <- + .colSumByGroupNumeric(estRmat, cbZ, max(cbZ)) + etaUnnormalized <- + rowSums(phiUnnormalized) - .colSumByGroupNumeric(phiUnnormalized, + trZ, max(trZ)) + + ## Update paramters + theta <- + (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) + phi <- + normalizeCounts(phiUnnormalized, + normalize = "proportion", + pseudocountNormalize = 1e-20) + eta <- + normalizeCounts(etaUnnormalized, + normalize = "proportion", + pseudocountNormalize = 1e-20) + + return(list( + "estRmat" = estRmat, + "theta" = theta, + "phi" = phi, + "eta" = eta, + "delta" = deltaV2 + )) +} + + + + ## Make sure provided parameters are the right type and value range .checkParametersDecon <- function(proportionPrior) { @@ -973,3 +898,138 @@ addLogLikelihood <- function(llA, llB) { } return(L) } + + + +######################### +# Simulating Data # +######################### + +#' @title Simulate contaminated count matrix +#' @description This function generates a list containing two count matrices -- +#' one for real expression, the other one for contamination, as well as other +#' parameters used in the simulation which can be useful for running +#' decontamination. +#' @param C Integer. Number of cells to be simulated. Default to be 300. +#' @param G Integer. Number of genes to be simulated. Default to be 100. +#' @param K Integer. Number of cell populations to be simulated. Default to be +#' 3. +#' @param NRange Integer vector. A vector of length 2 that specifies the lower +#' and upper bounds of the number of counts generated for each cell. Default to +#' be c(500, 1000). +#' @param beta Numeric. Concentration parameter for Phi. Default to be 0.5. +#' @param delta Numeric or Numeric vector. Concentration parameter for Theta. If +#' input as a single numeric value, symmetric values for beta distribution are +#' specified; if input as a vector of lenght 2, the two values will be the +#' shape1 and shape2 paramters of the beta distribution respectively. +#' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, +#' a default value of 12345 is used. If NULL, no calls to +#' \link[withr]{with_seed} are made. +#' @return A list object containing the real expression matrix and contamination +#' expression matrix as well as other parameters used in the simulation. +#' @examples +#' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = c(1, 9)) +#' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = 1) +#' @export +simulateContaminatedMatrix <- function(C = 300, + G = 100, + K = 3, + NRange = c(500, 1000), + beta = 0.5, + delta = c(1, 2), + seed = 12345) { + + if (is.null(seed)) { + res <- .simulateContaminatedMatrix(C = C, + G = G, + K = K, + NRange = NRange, + beta = beta, + delta = delta) + } else { + with_seed(seed, + res <- .simulateContaminatedMatrix(C = C, + G = G, + K = K, + NRange = NRange, + beta = beta, + delta = delta)) + } + + return(res) +} + + +.simulateContaminatedMatrix <- function(C = 300, + G = 100, + K = 3, + NRange = c(500, 1000), + beta = 0.5, + delta = c(1, 2)) { + if (length(delta) == 1) { + cpByC <- stats::rbeta(n = C, + shape1 = delta, + shape2 = delta) + } else { + cpByC <- stats::rbeta(n = C, + shape1 = delta[1], + shape2 = delta[2]) + } + + z <- sample(seq(K), size = C, replace = TRUE) + if (length(unique(z)) < K) { + warning( + "Only ", + length(unique(z)), + " clusters are simulated. Try to increase numebr of cells 'C' if", + " more clusters are needed" + ) + K <- length(unique(z)) + z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) + } + + NbyC <- sample(seq(min(NRange), max(NRange)), + size = C, + replace = TRUE) + cNbyC <- vapply(seq(C), function(i) { + stats::rbinom(n = 1, + size = NbyC[i], + p = cpByC[i]) + }, integer(1)) + rNbyC <- NbyC - cNbyC + + phi <- .rdirichlet(K, rep(beta, G)) + + ## sample real expressed count matrix + cellRmat <- vapply(seq(C), function(i) { + stats::rmultinom(1, size = rNbyC[i], prob = phi[z[i], ]) + }, integer(G)) + + rownames(cellRmat) <- paste0("Gene_", seq(G)) + colnames(cellRmat) <- paste0("Cell_", seq(C)) + + ## sample contamination count matrix + nGByK <- + rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K) + eta <- normalizeCounts(counts = nGByK, normalize = "proportion") + + cellCmat <- vapply(seq(C), function(i) { + stats::rmultinom(1, size = cNbyC[i], prob = eta[, z[i]]) + }, integer(G)) + cellOmat <- cellRmat + cellCmat + + rownames(cellOmat) <- paste0("Gene_", seq(G)) + colnames(cellOmat) <- paste0("Cell_", seq(C)) + + return( + list( + "nativeCounts" = cellRmat, + "observedCounts" = cellOmat, + "NByC" = NbyC, + "z" = z, + "eta" = eta, + "phi" = t(phi) + ) + ) +} + From 3f6b21af620005e85d32d78433064d1174012a53 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Tue, 7 Jan 2020 15:30:51 -0500 Subject: [PATCH 129/149] Moved conversion to dgCMatrix to within each batch --- R/decon.R | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/R/decon.R b/R/decon.R index c3e8ef75..a133fbdb 100644 --- a/R/decon.R +++ b/R/decon.R @@ -61,7 +61,7 @@ setGeneric("decontX", function(x, ...) standardGeneric("decontX")) #' @export #' @rdname decontX setMethod("decontX", "ANY", function(x, ...) { - .decontX(counts=x, ...) + .decontX(counts = x, ...) }) #' @export @@ -170,18 +170,6 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec logfile = logfile, verbose = verbose) - ## Convert to sparse matrix - if (!inherits(counts, "dgCMatrix")) { - .logMessages( - date(), - ".. Converting to sparse matrix", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - counts <- as(counts, "dgCMatrix") - } - totalGenes <- nrow(counts) totalCells <- ncol(counts) geneNames <- rownames(counts) @@ -234,6 +222,20 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec zBat <- NULL countsBat <- counts[, batch == bat] + + ## Convert to sparse matrix + if (!inherits(countsBat, "dgCMatrix")) { + .logMessages( + date(), + ".... Converting to sparse matrix", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + countsBat <- as(countsBat, "dgCMatrix") + } + + if (!is.null(z)) { zBat <- z[batch == bat] } From a276e9c84c147c12f496d9b4f8b9590554986a82 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Tue, 7 Jan 2020 15:39:07 -0500 Subject: [PATCH 130/149] Updated spacing in messages. Changed variable names for batchEstimates to estimates --- R/decon.R | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/R/decon.R b/R/decon.R index a133fbdb..f76943a0 100644 --- a/R/decon.R +++ b/R/decon.R @@ -36,9 +36,9 @@ #' @param logfile Character. Messages will be redirected to a file named #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. -#' @return 'decontX_counts' contains the decontaminated count matrix. +#' @return 'decontXcounts' contains the decontaminated count matrix. #' 'contamination' contains the per-cell contamination estimates. -#' 'batchEstimates' contains the estimated probability distributions +#' 'estimates' contains the estimated probability distributions #' for each batch. 'z' contains the cell cluster labels. 'runParams' #' contains a list of arguments used in the function call. #' @@ -79,14 +79,14 @@ setMethod("decontX", "SingleCellExperiment", function(x, ..., assayName="counts" ## Add new matrix into assay slot wiht same class as original counts if(class(mat) == "DelayedMatrix") { - decontXcounts(x) <- DelayedArray(result$decontX_counts) + decontXcounts(x) <- DelayedArray(result$decontXcounts) } else { SummarizedExperiment::assay(x, "decontXcounts") <- - as(result$decontX_counts, class(mat)) + as(result$decontXcounts, class(mat)) } ## Save the rest of the result object into metadata - result$decontX_counts <- NULL + result$decontXcounts <- NULL metadata(x)$decontX <- result x @@ -187,7 +187,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec ## Generate batch labels if none were supplied if (is.null(batch)) { - batch <- rep("batch", nC) + batch <- rep("all_cells", nC) } runParams$batch <- batch batchIndex <- unique(batch) @@ -212,8 +212,8 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec } else { .logMessages( date(), - ".. Analyzing cells in batch", - bat, + ".. Analyzing cells in batch '", + bat, "'", logfile = logfile, append = TRUE, verbose = verbose @@ -318,8 +318,8 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec returnResult <- list( "runParams" = runParams, - "batchEstimates" = resBatch, - "decontX_counts" = estRmat, + "estimates" = resBatch, + "decontXcounts" = estRmat, "contamination" = estConp, "z" = returnZ ) @@ -398,7 +398,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec if (is.null(z)) { .logMessages( date(), - ".. Estimating cell types with Celda", + ".... Estimating cell types with Celda", logfile = logfile, append = TRUE, verbose = verbose @@ -431,7 +431,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec .logMessages( date(), - ".. Estimating contamination", + ".... Estimating contamination", logfile = logfile, append = TRUE, verbose = verbose @@ -545,7 +545,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec # } .logMessages(date(), - ".... Completed iteration:", + "...... Completed iteration:", iter, "| converge:", signif(max.divergence, 4), @@ -809,7 +809,7 @@ addLogLikelihood <- function(llA, llB) { .logMessages( date(), - ".... Collapsing features into", + "...... Collapsing features into", L, "modules", logfile = logfile, @@ -831,7 +831,7 @@ addLogLikelihood <- function(llA, llB) { .logMessages( date(), - ".... Reducing dimensionality with UMAP", + "...... Reducing dimensionality with UMAP", logfile = logfile, append = TRUE, verbose = verbose @@ -844,7 +844,7 @@ addLogLikelihood <- function(llA, llB) { .logMessages( date(), - " .... Determining cell clusters with DBSCAN (Eps=", + " ...... Determining cell clusters with DBSCAN (Eps=", dbscanEps, ")", sep = "", From a9d130affd8e245cf1e97264f0acd4bac39d5de4 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Tue, 7 Jan 2020 16:45:50 -0500 Subject: [PATCH 131/149] Updated decontX documentation --- NAMESPACE | 5 ++++ R/decon.R | 49 +++++++++++++++++++++----------- man/decontX.Rd | 76 ++++++++++++++++++++++++++------------------------ 3 files changed, 77 insertions(+), 53 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 401ad844..b0030709 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export("decontXcounts<-") export(appendCeldaList) export(availableModels) export(bestLogLikelihood) @@ -18,6 +19,7 @@ export(clusters) export(compareCountMatrix) export(countChecksum) export(decontX) +export(decontXcounts) export(differentialExpression) export(distinctColors) export(factorizeMatrix) @@ -63,6 +65,7 @@ export(simulateContaminatedMatrix) export(subsetCeldaList) export(topRank) export(violinPlot) +exportMethods("decontXcounts<-") exportMethods(bestLogLikelihood) exportMethods(celdaHeatmap) exportMethods(celdaPerplexity) @@ -72,6 +75,8 @@ exportMethods(celdaUmap) exportMethods(clusterProbability) exportMethods(clusters) exportMethods(countChecksum) +exportMethods(decontX) +exportMethods(decontXcounts) exportMethods(factorizeMatrix) exportMethods(featureModuleLookup) exportMethods(logLikelihoodHistory) diff --git a/R/decon.R b/R/decon.R index f76943a0..d8d96ab0 100644 --- a/R/decon.R +++ b/R/decon.R @@ -1,21 +1,26 @@ -#' @title DecontX +#' @title Contamination estimation with decontX +#' #' @description Identifies contamination from factors such as ambient RNA #' in single cell genomic datasets. -#' @param counts Numeric matrix. Observed count matrix, rows represent -#' features and columns represent cells. +#' +#' @name decontX +#' +#' @param x A numeric matrix of counts, or a \linkS4class{SingleCellExperiment} +#' containing such a matrix in the 'assayName' assay. #' @param z Integer vector. Cell cluster labels. If NULL, Celda will be used #' to reduce the dimensionality of the dataset to 'L' modules, #' '\link[uwot]{umap}' from the 'uwot' package will be used to further #' reduce the dataset to 2 dimenions and the '\link[dbscan]{dbscan}' #' function from the 'dbscan' package will be used to identify clusters #' of broad cell types. Default NULL. -#' @param batch Integer vector. Batch labels for cells. If batch labels -#' are supplied, DecontX is run on cells from each batch separately. -#' Default NULL. +#' @param batch Numeric or character vector. Batch labels for cells. +#' If batch labels are supplied, DecontX is run on cells from each +#' batch separately. Cells run in different channels or assays +#' should be considered different batches. Default NULL. #' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500. #' @param convergence Numeric. The EM algorithm will be stopped if the maximum -#' difference in the contamination estimates between the previous 'convergenceCheck' -#' is less than this. Default 0.001. +#' difference in the contamination estimates between the previous and +#' current iterations is less than this. Default 0.001. #' @param iterLogLik Integer. Calculate log likelihood every 'iterLogLik' #' iteration. Default 10. #' @param delta Numeric. Symmetric Dirichlet concentration parameter @@ -36,11 +41,23 @@ #' @param logfile Character. Messages will be redirected to a file named #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. -#' @return 'decontXcounts' contains the decontaminated count matrix. -#' 'contamination' contains the per-cell contamination estimates. -#' 'estimates' contains the estimated probability distributions -#' for each batch. 'z' contains the cell cluster labels. 'runParams' -#' contains a list of arguments used in the function call. +#' +#' @return If \code{x} is a matrix-like object, a list will be returned +#' with the following items: +#' \describe{ +#' \item{\code{decontXcounts}:}{The decontaminated count matrix.} +#' \item{\code{contamination}:}{Percentage of contamination in each cell.} +#' \item{\code{estimates}:}{Estimated probability distributions +#' for each batch.} +#' \item{\code{z}:}{Cell population/cluster labels used for analysis.} +#' \item{\code{runParams}:}{List of arguments used in the function call.} +#' } +#' +#' If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated +#' counts will be stored as an assay and can be accessed with +#' \code{decontXcounts(x)}. The contamination values and cluster labels +#' will be stored in \code{colData(x)}. All other items will be stored +#' in \code{metadata(x)$decontX}. #' #' @examples #' s <- simulateContaminatedMatrix() @@ -65,7 +82,6 @@ setMethod("decontX", "ANY", function(x, ...) { }) #' @export -#' @importFrom SummarizedExperiment assay #' @rdname decontX setMethod("decontX", "SingleCellExperiment", function(x, ..., assayName="counts") { @@ -79,7 +95,7 @@ setMethod("decontX", "SingleCellExperiment", function(x, ..., assayName="counts" ## Add new matrix into assay slot wiht same class as original counts if(class(mat) == "DelayedMatrix") { - decontXcounts(x) <- DelayedArray(result$decontXcounts) + decontXcounts(x) <- DelayedArray::DelayedArray(result$decontXcounts) } else { SummarizedExperiment::assay(x, "decontXcounts") <- as(result$decontXcounts, class(mat)) @@ -212,8 +228,9 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec } else { .logMessages( date(), - ".. Analyzing cells in batch '", + " .. Analyzing cells in batch '", bat, "'", + sep = "", logfile = logfile, append = TRUE, verbose = verbose diff --git a/man/decontX.Rd b/man/decontX.Rd index c11bff64..38b7028f 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -2,27 +2,17 @@ % Please edit documentation in R/decon.R \name{decontX} \alias{decontX} -\title{DecontX} +\alias{decontX,ANY-method} +\alias{decontX,SingleCellExperiment-method} +\title{Contamination estimation with decontX} \usage{ -decontX( - counts, - z = NULL, - batch = NULL, - maxIter = 500, - delta = 10, - convergence = 0.001, - convergenceStep = 10, - logfile = NULL, - verbose = TRUE, - varGenes = 5000, - L = 50, - dbscanEps = 1, - seed = 12345 -) +\S4method{decontX}{ANY}(x, ...) + +\S4method{decontX}{SingleCellExperiment}(x, ..., assayName = "counts") } \arguments{ -\item{counts}{Numeric matrix. Observed count matrix, rows represent -features and columns represent cells.} +\item{x}{A numeric matrix of counts, or a \linkS4class{SingleCellExperiment} +containing such a matrix in the 'assayName' assay.} \item{z}{Integer vector. Cell cluster labels. If NULL, Celda will be used to reduce the dimensionality of the dataset to 'L' modules, @@ -31,27 +21,26 @@ reduce the dataset to 2 dimenions and the '\link[dbscan]{dbscan}' function from the 'dbscan' package will be used to identify clusters of broad cell types. Default NULL.} -\item{batch}{Integer vector. Batch labels for cells. If batch labels -are supplied, DecontX is run on cells from each batch separately. -Default NULL.} +\item{batch}{Numeric or character vector. Batch labels for cells. +If batch labels are supplied, DecontX is run on cells from each +batch separately. Cells run in different channels or assays +should be considered different batches. Default NULL.} \item{maxIter}{Integer. Maximum iterations of the EM algorithm. Default 500.} -\item{delta}{Numeric. Symmetric Dirichlet concentration parameter -to initialize theta. Default 10.} - \item{convergence}{Numeric. The EM algorithm will be stopped if the maximum -difference in the contamination estimates between the previous 'convergenceCheck' -is less than this. Default 0.001.} +difference in the contamination estimates between the previous and +current iterations is less than this. Default 0.001.} -\item{logfile}{Character. Messages will be redirected to a file named -`logfile`. If NULL, messages will be printed to stdout. Default NULL.} +\item{iterLogLik}{Integer. Calculate log likelihood every 'iterLogLik' +iteration. Default 10.} -\item{verbose}{Logical. Whether to print log messages. Default TRUE.} +\item{delta}{Numeric. Symmetric Dirichlet concentration parameter +to initialize theta. Default 10.} \item{varGenes}{Integer. The number of variable genes to use in Celda clustering. Variability is calcualted using '\link[scran]{modelGeneVar}' -function from the 'scran' package. +function from the 'scran' package. Used only when z is not provided. Default 5000.} \item{L}{Integer. Number of modules for Celda clustering. Used to reduce @@ -66,15 +55,28 @@ Used only when z is not provided. Default 1.} a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} -\item{iterLogLik}{Integer. Every iteration to calculate log likelihood -Default 10.} +\item{logfile}{Character. Messages will be redirected to a file named +`logfile`. If NULL, messages will be printed to stdout. Default NULL.} + +\item{verbose}{Logical. Whether to print log messages. Default TRUE.} } \value{ -'decontX_counts' contains the decontaminated count matrix. -'contamination' contains the per-cell contamination estimates. -'batchEstimates' contains the estimated probability distributions -for each batch. 'z' contains the cell cluster labels. 'runParams' -contains a list of arguments used in the function call. +If \code{x} is a matrix-like object, a list will be returned +with the following items: +\describe{ +\item{\code{decontXcounts}:}{The decontaminated count matrix.} +\item{\code{contamination}:}{Percentage of contamination in each cell.} +\item{\code{estimates}:}{Estimated probability distributions +for each batch.} +\item{\code{z}:}{Cell population/cluster labels.} +\item{\code{runParams}:}{List of arguments used in the function call.} +} + +If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated +counts will be stored as an assay and can be accessed with +\code{decontXcounts(x)}. The contamination values and cluster labels +will be stored in \code{colData(x)}. All other items will be stored +in \code{metadata(x)$decontX}. } \description{ Identifies contamination from factors such as ambient RNA From 8bc0d111c3dab2d578e1e73d5e4fe6b8130cd889 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Tue, 7 Jan 2020 17:47:07 -0500 Subject: [PATCH 132/149] Updated decontX docs --- R/decon.R | 100 +++++++++++++++++++++++++++++++++++++------------ man/decontX.Rd | 70 ++++++++++++++++++++++++---------- 2 files changed, 127 insertions(+), 43 deletions(-) diff --git a/R/decon.R b/R/decon.R index d8d96ab0..288c2822 100644 --- a/R/decon.R +++ b/R/decon.R @@ -7,12 +7,14 @@ #' #' @param x A numeric matrix of counts, or a \linkS4class{SingleCellExperiment} #' containing such a matrix in the 'assayName' assay. -#' @param z Integer vector. Cell cluster labels. If NULL, Celda will be used -#' to reduce the dimensionality of the dataset to 'L' modules, -#' '\link[uwot]{umap}' from the 'uwot' package will be used to further -#' reduce the dataset to 2 dimenions and the '\link[dbscan]{dbscan}' -#' function from the 'dbscan' package will be used to identify clusters -#' of broad cell types. Default NULL. +#' @param assayName Character. Name of the assay to use if \code{x} is a +#' \linkS4class{SingleCellExperiment}. +#' @param z Numeric or character vector. Cell cluster labels. If NULL, +#' Celda will be used to reduce the dimensionality of the dataset +#' to 'L' modules, '\link[uwot]{umap}' from the 'uwot' package +#' will be used to further reduce the dataset to 2 dimenions and +#' the '\link[dbscan]{dbscan}' function from the 'dbscan' package +#' will be used to identify clusters of broad cell types. Default NULL. #' @param batch Numeric or character vector. Batch labels for cells. #' If batch labels are supplied, DecontX is run on cells from each #' batch separately. Cells run in different channels or assays @@ -56,8 +58,8 @@ #' If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated #' counts will be stored as an assay and can be accessed with #' \code{decontXcounts(x)}. The contamination values and cluster labels -#' will be stored in \code{colData(x)}. All other items will be stored -#' in \code{metadata(x)$decontX}. +#' will be stored in \code{colData(x)}. \code{estimates} and \code{runParams} +#' will be stored in \code{metadata(x)$decontX}. #' #' @examples #' s <- simulateContaminatedMatrix() @@ -75,23 +77,43 @@ setGeneric("decontX", function(x, ...) standardGeneric("decontX")) # Setting up S4 methods # ######################### -#' @export -#' @rdname decontX -setMethod("decontX", "ANY", function(x, ...) { - .decontX(counts = x, ...) -}) #' @export #' @rdname decontX -setMethod("decontX", "SingleCellExperiment", function(x, ..., assayName="counts") +setMethod("decontX", "SingleCellExperiment", function(x, + assayName="counts", + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE) { mat <- SummarizedExperiment::assay(x, i=assayName) - result <- .decontX(mat, ...) + result <- .decontX(counts = mat, + z = z, + batch = batch, + maxIter = maxIter, + convergence = convergence, + iterLogLik = iterLogLik, + delta = delta, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + seed = seed, + logfile = logfile, + verbose = verbose) ## Add results into column annotation colData(x) = cbind(colData(x), - celda_decontX_Contamination = result$contamination, - celda_decontX_Clusters = result$z) + decontX_Contamination = result$contamination, + decontX_Clusters = result$z) ## Add new matrix into assay slot wiht same class as original counts if(class(mat) == "DelayedMatrix") { @@ -108,6 +130,37 @@ setMethod("decontX", "SingleCellExperiment", function(x, ..., assayName="counts" x }) +#' @export +#' @rdname decontX +setMethod("decontX", "ANY", function(x, + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE) +{ + .decontX(counts = x, + z = z, + batch = batch, + maxIter = maxIter, + convergence = convergence, + iterLogLik = iterLogLik, + delta = delta, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + seed = seed, + logfile = logfile, + verbose = verbose) +}) + ## Copied from SingleCellExperiment Package @@ -141,8 +194,6 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec - - ########################## # Core Decontx Functions # ########################## @@ -151,15 +202,15 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec z = NULL, batch = NULL, maxIter = 200, - delta = 10, convergence = 0.001, iterLogLik = 10, - logfile = NULL, - verbose = TRUE, + delta = 10, varGenes = NULL, + L = NULL, dbscanEps = NULL, - L = NULL, - seed = 12345) { + seed = 12345, + logfile = NULL, + verbose = TRUE) { startTime <- Sys.time() .logMessages(paste(rep("-", 50), collapse = ""), @@ -429,6 +480,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec celda.init <- .decontxInitializeZ(object = counts, varGenes = varGenes, + L = L, dbscanEps = dbscanEps, verbose = verbose, logfile = logfile) diff --git a/man/decontX.Rd b/man/decontX.Rd index 38b7028f..58d25b79 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -2,24 +2,56 @@ % Please edit documentation in R/decon.R \name{decontX} \alias{decontX} -\alias{decontX,ANY-method} \alias{decontX,SingleCellExperiment-method} +\alias{decontX,ANY-method} \title{Contamination estimation with decontX} \usage{ -\S4method{decontX}{ANY}(x, ...) +\S4method{decontX}{SingleCellExperiment}( + x, + assayName = "counts", + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE +) -\S4method{decontX}{SingleCellExperiment}(x, ..., assayName = "counts") +\S4method{decontX}{ANY}( + x, + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{x}{A numeric matrix of counts, or a \linkS4class{SingleCellExperiment} containing such a matrix in the 'assayName' assay.} -\item{z}{Integer vector. Cell cluster labels. If NULL, Celda will be used -to reduce the dimensionality of the dataset to 'L' modules, -'\link[uwot]{umap}' from the 'uwot' package will be used to further -reduce the dataset to 2 dimenions and the '\link[dbscan]{dbscan}' -function from the 'dbscan' package will be used to identify clusters -of broad cell types. Default NULL.} +\item{assayName}{Character. Name of the assay to use if \code{x} is a +\linkS4class{SingleCellExperiment}.} + +\item{z}{Numeric or character vector. Cell cluster labels. If NULL, +Celda will be used to reduce the dimensionality of the dataset +to 'L' modules, '\link[uwot]{umap}' from the 'uwot' package +will be used to further reduce the dataset to 2 dimenions and +the '\link[dbscan]{dbscan}' function from the 'dbscan' package +will be used to identify clusters of broad cell types. Default NULL.} \item{batch}{Numeric or character vector. Batch labels for cells. If batch labels are supplied, DecontX is run on cells from each @@ -28,6 +60,9 @@ should be considered different batches. Default NULL.} \item{maxIter}{Integer. Maximum iterations of the EM algorithm. Default 500.} +\item{delta}{Numeric. Symmetric Dirichlet concentration parameter +to initialize theta. Default 10.} + \item{convergence}{Numeric. The EM algorithm will be stopped if the maximum difference in the contamination estimates between the previous and current iterations is less than this. Default 0.001.} @@ -35,22 +70,19 @@ current iterations is less than this. Default 0.001.} \item{iterLogLik}{Integer. Calculate log likelihood every 'iterLogLik' iteration. Default 10.} -\item{delta}{Numeric. Symmetric Dirichlet concentration parameter -to initialize theta. Default 10.} - \item{varGenes}{Integer. The number of variable genes to use in Celda clustering. Variability is calcualted using '\link[scran]{modelGeneVar}' function from the 'scran' package. Used only when z is not provided. Default 5000.} -\item{L}{Integer. Number of modules for Celda clustering. Used to reduce -the dimensionality of the dataset before applying UMAP and dbscan. -Used only when z is not provided. Default 50.} - \item{dbscanEps}{Numeric. The clustering resolution parameter used in '\link[dbscan]{dbscan}' to estimate broad cell clusters. Used only when z is not provided. Default 1.} +\item{L}{Integer. Number of modules for Celda clustering. Used to reduce +the dimensionality of the dataset before applying UMAP and dbscan. +Used only when z is not provided. Default 50.} + \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} @@ -68,15 +100,15 @@ with the following items: \item{\code{contamination}:}{Percentage of contamination in each cell.} \item{\code{estimates}:}{Estimated probability distributions for each batch.} -\item{\code{z}:}{Cell population/cluster labels.} +\item{\code{z}:}{Cell population/cluster labels used for analysis.} \item{\code{runParams}:}{List of arguments used in the function call.} } If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated counts will be stored as an assay and can be accessed with \code{decontXcounts(x)}. The contamination values and cluster labels -will be stored in \code{colData(x)}. All other items will be stored -in \code{metadata(x)$decontX}. +will be stored in \code{colData(x)}. \code{estimates} and \code{runParams} +will be stored in \code{metadata(x)$decontX}. } \description{ Identifies contamination from factors such as ambient RNA From c98c3922d68635306a462ba0b12133ececb0c042 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Tue, 7 Jan 2020 22:08:48 -0500 Subject: [PATCH 133/149] Added UMAP to 'reducedDims' when 'x' is a SingleCellExperiment object --- R/decon.R | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index 288c2822..347a4d22 100644 --- a/R/decon.R +++ b/R/decon.R @@ -123,11 +123,34 @@ setMethod("decontX", "SingleCellExperiment", function(x, as(result$decontXcounts, class(mat)) } + ## Put estimated UMAPs into SCE if z was estimated with Celda/UMAP + if (is.null(result$runParams$z)) { + batchIndex <- unique(result$runParams$batch) + if (length(batchIndex) > 1) { + for(i in batchIndex) { + + ## Each individual UMAP will only be for one batch so need + ## to put NAs in for cells in other batches + tempUMAP <- matrix(NA, ncol = 2, nrow = ncol(mat)) + tempUMAP[result$runParams$batch == i,] <- result$estimates[[i]]$UMAP + colnames(tempUMAP) <- c("UMAP_1", "UMAP_2") + rownames(tempUMAP) <- colnames(mat) + + SingleCellExperiment::reducedDim(x, + paste("decontX", i, "UMAP", sep="_")) <- tempUMAP + } + } else { + SingleCellExperiment::reducedDim(x,"decontX_UMAP") <- + result$estimates[[batchIndex]]$UMAP + } + } + + ## Save the rest of the result object into metadata result$decontXcounts <- NULL metadata(x)$decontX <- result - x + return(x) }) #' @export @@ -227,6 +250,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec verbose = verbose) runParams <- list( + z = z, batch = batch, maxIter = maxIter, delta = delta, From a6ba7f05ab8f12dd02c1e7aadef54eb44df5c4e4 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 09:07:19 -0500 Subject: [PATCH 134/149] Fixed call to uwot::umap to ensure reprocibility. Used celdaUmap function instead of direct call --- R/decon.R | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/R/decon.R b/R/decon.R index 347a4d22..c4d131c1 100644 --- a/R/decon.R +++ b/R/decon.R @@ -910,17 +910,22 @@ addLogLikelihood <- function(llA, llB) { verbose = verbose ) ## Celda clustering using recursive module splitting - if (L < nrow(countsFiltered)) { - initialModuleSplit <- recursiveSplitModule(countsFiltered, - initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) - initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) - fm <- factorizeMatrix(countsFiltered, initialModel, type = "counts") - fm <- fm$counts$cell - rm(initialModuleSplit) - rm(initialModel) - } else { - fm <- countsFiltered - } + L <- min(L, nrow(countsFiltered)) + initialModuleSplit <- recursiveSplitModule(countsFiltered, + initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) + + #if (L < nrow(countsFiltered)) { + # initialModuleSplit <- recursiveSplitModule(countsFiltered, + # initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + # initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) + # fm <- factorizeMatrix(countsFiltered, initialModel, type = "counts") + # fm <- fm$counts$cell + # rm(initialModuleSplit) + # rm(initialModel) + #} else { + # fm <- countsFiltered + #} .logMessages( date(), @@ -931,9 +936,11 @@ addLogLikelihood <- function(llA, llB) { ) ## Louvan graph-based method to reduce dimension into 2 cluster nNeighbors <- min(15, ncol(countsFiltered)) - resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors, - min_dist = 0.01, spread = 1) - rm(fm) + #resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors, + # min_dist = 0.01, spread = 1) + #rm(fm) + resUmap <- celdaUmap(countsFiltered, initialModel, + minDist = 0.01, spread = 1, nNeighbors = nNeighbors) .logMessages( date(), From bde91bf9bf301a5c593fa2683a4c77bb7a2a9e38 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 09:15:39 -0500 Subject: [PATCH 135/149] Fixed decontX test units --- tests/testthat/test-decon.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-decon.R b/tests/testthat/test-decon.R index 936cf5b5..bf0423aa 100644 --- a/tests/testthat/test-decon.R +++ b/tests/testthat/test-decon.R @@ -29,15 +29,15 @@ test_that(desc = "Testing simulateContaminatedMatrix", { ## .decontXoneBatch test_that(desc = "Testing .decontXoneBatch", { - expect_error(decontX(counts = deconSim$observedCounts, + expect_error(decontX(x = deconSim$observedCounts, z = deconSim$z, delta = -1), "'delta' should be a single positive value.") - expect_error(decontX(counts = deconSim$observedCounts, + expect_error(decontX(x = deconSim$observedCounts, z = deconSim$z, delta = c(1, 1)), "'delta' should be a single positive value.") - expect_error(decontX(counts = deconSim$observedCounts, + expect_error(decontX(x = deconSim$observedCounts, z = c(deconSim$z, 1)), paste0("'z' must be of the same length as the number of cells in the", " 'counts' matrix.")) @@ -56,7 +56,7 @@ test_that(desc = "Testing .decontXoneBatch", { #test_that(desc = "Testing logLikelihood.DecontXoneBatch", { # z.process = processCellLabels(deconSim$z, # num.cells=ncol(deconSim$observedCounts) ) - # expect_equal( decon.calcLL(counts=deconSim$observedCounts, z=z.process , + # expect_equal( decon.calcLL(x=deconSim$observedCounts, z=z.process , # theta=modelDecontXoneBatch$resList$theta, # eta=modelDecontXoneBatch$resList$est.ConDist, # phi=modelDecontXoneBatch$resList$est.GeneDist ), From bae21ac5dac21dc0789b9cb34fbfed7925b60732 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 09:21:58 -0500 Subject: [PATCH 136/149] Updated decontX docs --- R/decon.R | 13 ++++++++----- man/decontX.Rd | 13 ++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/R/decon.R b/R/decon.R index c4d131c1..ba0f950d 100644 --- a/R/decon.R +++ b/R/decon.R @@ -28,8 +28,8 @@ #' @param delta Numeric. Symmetric Dirichlet concentration parameter #' to initialize theta. Default 10. #' @param varGenes Integer. The number of variable genes to use in -#' Celda clustering. Variability is calcualted using '\link[scran]{modelGeneVar}' -#' function from the 'scran' package. +#' Celda clustering. Variability is calcualted using +#' \code{\link[scran]{modelGeneVar}} function from the 'scran' package. #' Used only when z is not provided. Default 5000. #' @param L Integer. Number of modules for Celda clustering. Used to reduce #' the dimensionality of the dataset before applying UMAP and dbscan. @@ -49,8 +49,9 @@ #' \describe{ #' \item{\code{decontXcounts}:}{The decontaminated count matrix.} #' \item{\code{contamination}:}{Percentage of contamination in each cell.} -#' \item{\code{estimates}:}{Estimated probability distributions -#' for each batch.} +#' \item{\code{estimates}:}{List of estimated parameters for each batch. If z +#' was not supplied, then the UMAP coordinates used to generated cell +#' cluster labels will also be stored here.} #' \item{\code{z}:}{Cell population/cluster labels used for analysis.} #' \item{\code{runParams}:}{List of arguments used in the function call.} #' } @@ -59,7 +60,9 @@ #' counts will be stored as an assay and can be accessed with #' \code{decontXcounts(x)}. The contamination values and cluster labels #' will be stored in \code{colData(x)}. \code{estimates} and \code{runParams} -#' will be stored in \code{metadata(x)$decontX}. +#' will be stored in \code{metadata(x)$decontX}. If z was not supplied, then +#' the UMAPs used to generated cell cluster labels will be stored in +#' \code{reducedDims} slot in \code{x} #' #' @examples #' s <- simulateContaminatedMatrix() diff --git a/man/decontX.Rd b/man/decontX.Rd index 58d25b79..2964ec80 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -71,8 +71,8 @@ current iterations is less than this. Default 0.001.} iteration. Default 10.} \item{varGenes}{Integer. The number of variable genes to use in -Celda clustering. Variability is calcualted using '\link[scran]{modelGeneVar}' -function from the 'scran' package. +Celda clustering. Variability is calcualted using +\code{\link[scran]{modelGeneVar}} function from the 'scran' package. Used only when z is not provided. Default 5000.} \item{dbscanEps}{Numeric. The clustering resolution parameter @@ -98,8 +98,9 @@ with the following items: \describe{ \item{\code{decontXcounts}:}{The decontaminated count matrix.} \item{\code{contamination}:}{Percentage of contamination in each cell.} -\item{\code{estimates}:}{Estimated probability distributions -for each batch.} +\item{\code{estimates}:}{List of estimated parameters for each batch. If z +was not supplied, then the UMAP coordinates used to generated cell +cluster labels will also be stored here.} \item{\code{z}:}{Cell population/cluster labels used for analysis.} \item{\code{runParams}:}{List of arguments used in the function call.} } @@ -108,7 +109,9 @@ If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated counts will be stored as an assay and can be accessed with \code{decontXcounts(x)}. The contamination values and cluster labels will be stored in \code{colData(x)}. \code{estimates} and \code{runParams} -will be stored in \code{metadata(x)$decontX}. +will be stored in \code{metadata(x)$decontX}. If z was not supplied, then +the UMAPs used to generated cell cluster labels will be stored in +\code{reducedDims} slot in \code{x} } \description{ Identifies contamination from factors such as ambient RNA From 2e1d21f9ead7fe0c7df7deb2dda45a13198539d8 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 12:28:51 -0500 Subject: [PATCH 137/149] Added code to attempt to convert decontaminated matrix to format/class of original matrix. Also updated docs --- R/decon.R | 74 ++++++++++++++++++++++++++++++++++---------------- man/decontX.Rd | 6 ++-- 2 files changed, 55 insertions(+), 25 deletions(-) diff --git a/R/decon.R b/R/decon.R index ba0f950d..d30dd03b 100644 --- a/R/decon.R +++ b/R/decon.R @@ -5,8 +5,10 @@ #' #' @name decontX #' -#' @param x A numeric matrix of counts, or a \linkS4class{SingleCellExperiment} -#' containing such a matrix in the 'assayName' assay. +#' @param x A numeric matrix of counts or a \linkS4class{SingleCellExperiment} +#' with the matrix located in the assay slot under 'assayName'. \code{x} will +#' be converted to a sparse matrix of class "dgCMatrix" from the +#' \code{\link[Matrix]} package. #' @param assayName Character. Name of the assay to use if \code{x} is a #' \linkS4class{SingleCellExperiment}. #' @param z Numeric or character vector. Cell cluster labels. If NULL, @@ -117,15 +119,7 @@ setMethod("decontX", "SingleCellExperiment", function(x, colData(x) = cbind(colData(x), decontX_Contamination = result$contamination, decontX_Clusters = result$z) - - ## Add new matrix into assay slot wiht same class as original counts - if(class(mat) == "DelayedMatrix") { - decontXcounts(x) <- DelayedArray::DelayedArray(result$decontXcounts) - } else { - SummarizedExperiment::assay(x, "decontXcounts") <- - as(result$decontXcounts, class(mat)) - } - + ## Put estimated UMAPs into SCE if z was estimated with Celda/UMAP if (is.null(result$runParams$z)) { batchIndex <- unique(result$runParams$batch) @@ -206,16 +200,19 @@ SET_FUN <- function(exprs_values, ...) { } #' @export -setGeneric("decontXcounts", function(object, ...) standardGeneric("decontXcounts")) +setGeneric("decontXcounts", function(object, ...) + standardGeneric("decontXcounts")) #' @export -setGeneric("decontXcounts<-", function(object, ..., value) standardGeneric("decontXcounts<-")) +setGeneric("decontXcounts<-", function(object, ..., value) + standardGeneric("decontXcounts<-")) #' @export setMethod("decontXcounts", "SingleCellExperiment", GET_FUN("decontXcounts")) #' @export -setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("decontXcounts")) +setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), + SET_FUN("decontXcounts")) @@ -419,6 +416,37 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec "z" = returnZ ) + ## Try to convert class of new matrix to class of original matrix + if (class(counts) != "dgCMatrix") { + .logMessages( + date(), + ".. Finalizing decontaminated matrix", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + } + + if (class(counts) %in% c("DelayedMatrix", "DelayedArray")) { + + ## Determine class of seed in DelayedArray + seed.class <- unique(DelayedArray::seedApply(counts, class))[[1]] + if (seed.class == "HDF5ArraySeed") { + returnResult$decontXcounts <- as(returnResult$decontXcounts, "HDF5Matrix") + } else { + if (isTRUE(canCoerce(returnResult$decontXcounts, seed.class))) { + returnResult$decontXcounts <- as(returnResult$decontXcounts, seed.class) + } + } + returnResult$decontXcounts <- + DelayedArray::DelayedArray(returnResult$decontXcounts) + + } else if (canCoerce(result$decontXcounts, class(counts))) { + + returnResult$decontXcounts <- as(returnResult$decontXcounts, class(counts)) + + } + # } else { ## When there is only one batch # returnResult <- .decontXoneBatch( @@ -694,7 +722,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), SET_FUN("dec return(ll) } -# DEPRECATED. This is not used, but is kept as it might be useful in the future. +# DEPRECATED. This is not used, but is kept as it might be useful in the future # This function calculates the log-likelihood of background distribution # decontamination # bgDist Numeric matrix. Rows represent feature and columns are the times that @@ -919,15 +947,15 @@ addLogLikelihood <- function(llA, llB) { initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) #if (L < nrow(countsFiltered)) { - # initialModuleSplit <- recursiveSplitModule(countsFiltered, - # initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) - # initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) - # fm <- factorizeMatrix(countsFiltered, initialModel, type = "counts") - # fm <- fm$counts$cell - # rm(initialModuleSplit) - # rm(initialModel) + # initialModuleSplit <- recursiveSplitModule(countsFiltered, + # initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + # initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) + # fm <- factorizeMatrix(countsFiltered, initialModel, type = "counts") + # fm <- fm$counts$cell + # rm(initialModuleSplit) + # rm(initialModel) #} else { - # fm <- countsFiltered + # fm <- countsFiltered #} .logMessages( diff --git a/man/decontX.Rd b/man/decontX.Rd index 2964ec80..2c40aa7a 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -40,8 +40,10 @@ ) } \arguments{ -\item{x}{A numeric matrix of counts, or a \linkS4class{SingleCellExperiment} -containing such a matrix in the 'assayName' assay.} +\item{x}{A numeric matrix of counts or a \linkS4class{SingleCellExperiment} +with the matrix located in the assay slot under 'assayName'. \code{x} will +be converted to a sparse matrix of class "dgCMatrix" from the +\code{\link[Matrix]} package.} \item{assayName}{Character. Name of the assay to use if \code{x} is a \linkS4class{SingleCellExperiment}.} From ec792981c04c4c162b43533d5ff76001f512ce8b Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 12:47:36 -0500 Subject: [PATCH 138/149] Fixed incorrect variable name --- R/decon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index d30dd03b..e6c2f4a2 100644 --- a/R/decon.R +++ b/R/decon.R @@ -441,7 +441,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), returnResult$decontXcounts <- DelayedArray::DelayedArray(returnResult$decontXcounts) - } else if (canCoerce(result$decontXcounts, class(counts))) { + } else if (canCoerce(returnResult$decontXcounts, class(counts))) { returnResult$decontXcounts <- as(returnResult$decontXcounts, class(counts)) From d8d8236fef62be79713f0c295fb2142ee3756118 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 13:22:22 -0500 Subject: [PATCH 139/149] Changed check for class to use function 'inherits'. Updated docs --- R/decon.R | 10 +++++----- man/decontX.Rd | 6 +++--- tests/testthat/test-decon.R | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/R/decon.R b/R/decon.R index e6c2f4a2..e9ed8d0e 100644 --- a/R/decon.R +++ b/R/decon.R @@ -6,9 +6,9 @@ #' @name decontX #' #' @param x A numeric matrix of counts or a \linkS4class{SingleCellExperiment} -#' with the matrix located in the assay slot under 'assayName'. \code{x} will -#' be converted to a sparse matrix of class "dgCMatrix" from the -#' \code{\link[Matrix]} package. +#' with the matrix located in the assay slot under \code{assayName}. +#' Cells in each batch will be subsetted and converted to a sparse matrix +#' of class \code{dgCMatrix} from package \link{Matrix} before analysis. #' @param assayName Character. Name of the assay to use if \code{x} is a #' \linkS4class{SingleCellExperiment}. #' @param z Numeric or character vector. Cell cluster labels. If NULL, @@ -417,7 +417,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), ) ## Try to convert class of new matrix to class of original matrix - if (class(counts) != "dgCMatrix") { + if (inherits(counts, "dgCMatrix")) { .logMessages( date(), ".. Finalizing decontaminated matrix", @@ -427,7 +427,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), ) } - if (class(counts) %in% c("DelayedMatrix", "DelayedArray")) { + if (inherits(counts, c("DelayedMatrix", "DelayedArray"))) { ## Determine class of seed in DelayedArray seed.class <- unique(DelayedArray::seedApply(counts, class))[[1]] diff --git a/man/decontX.Rd b/man/decontX.Rd index 2c40aa7a..21b1c1f9 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -41,9 +41,9 @@ } \arguments{ \item{x}{A numeric matrix of counts or a \linkS4class{SingleCellExperiment} -with the matrix located in the assay slot under 'assayName'. \code{x} will -be converted to a sparse matrix of class "dgCMatrix" from the -\code{\link[Matrix]} package.} +with the matrix located in the assay slot under \code{assayName}. +Cells in each batch will be subsetted and converted to a sparse matrix +of class \code{dgCMatrix} from package \link{Matrix} before analysis.} \item{assayName}{Character. Name of the assay to use if \code{x} is a \linkS4class{SingleCellExperiment}.} diff --git a/tests/testthat/test-decon.R b/tests/testthat/test-decon.R index bf0423aa..f5baa4ba 100644 --- a/tests/testthat/test-decon.R +++ b/tests/testthat/test-decon.R @@ -3,7 +3,7 @@ library(celda) context("Testing Deconx") deconSim <- simulateContaminatedMatrix(K = 10, delta = c(1, 5)) -modelDecontXoneBatch <- .decontX(deconSim$observedCounts, +modelDecontXoneBatch <- decontX(deconSim$observedCounts, z = deconSim$z, maxIter = 2) From 30f227cd1890f87ba3ad582e43b18be039d2272c Mon Sep 17 00:00:00 2001 From: zhewa Date: Wed, 8 Jan 2020 13:29:33 -0500 Subject: [PATCH 140/149] fix lints --- R/decon.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/decon.R b/R/decon.R index b6ec8faf..5a451157 100644 --- a/R/decon.R +++ b/R/decon.R @@ -1,5 +1,4 @@ - #' @title Simulate contaminated count matrix #' @description This function generates a list containing two count matrices -- #' one for real expression, the other one for contamination, as well as other @@ -189,9 +188,9 @@ simulateContaminatedMatrix <- function(C = 300, rnGByK <- .colSumByGroupNumeric(estRmat, z, K) cnGByK <- rowSums(rnGByK) - rnGByK - TNbyC <- colSums(counts) + TNbyC <- colSums(counts) estRbyCol <- colSums(estRmat) - + PrbyC <- estRbyCol / TNbyC PcbyC <- 1 - PrbyC deltaV2 <- MCMCprecision::fit_dirichlet(cbind(PrbyC, PcbyC))$alpha From 885a9d6875926f1806156ae39b0f3629736b74f2 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 13:48:09 -0500 Subject: [PATCH 141/149] Temporarily removed conversion code --- R/decon.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/decon.R b/R/decon.R index e9ed8d0e..bdc52feb 100644 --- a/R/decon.R +++ b/R/decon.R @@ -441,11 +441,11 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), returnResult$decontXcounts <- DelayedArray::DelayedArray(returnResult$decontXcounts) - } else if (canCoerce(returnResult$decontXcounts, class(counts))) { + } #else if (canCoerce(returnResult$decontXcounts, class(counts))) { - returnResult$decontXcounts <- as(returnResult$decontXcounts, class(counts)) + #returnResult$decontXcounts <- as(returnResult$decontXcounts, class(counts)) - } + #} # } else { ## When there is only one batch From 1c2ecd06f67fc501305363f243d9606da13cfe62 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 15:52:31 -0500 Subject: [PATCH 142/149] Added 'try' statement around conversion as a backup in case 'canCoerce' produces an error --- R/decon.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R/decon.R b/R/decon.R index bdc52feb..3c660715 100644 --- a/R/decon.R +++ b/R/decon.R @@ -441,11 +441,14 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), returnResult$decontXcounts <- DelayedArray::DelayedArray(returnResult$decontXcounts) - } #else if (canCoerce(returnResult$decontXcounts, class(counts))) { - - #returnResult$decontXcounts <- as(returnResult$decontXcounts, class(counts)) - - #} + } else { + try({ + if (canCoerce(returnResult$decontXcounts, class(counts))) { + returnResult$decontXcounts <- + as(returnResult$decontXcounts, class(counts)) + } + }) + } # } else { ## When there is only one batch From 00b0922f041cb9f3d506bb179041045b47ac163b Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Wed, 8 Jan 2020 16:36:57 -0500 Subject: [PATCH 143/149] set silent=TRUE in try statement --- R/decon.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/decon.R b/R/decon.R index 3c660715..2df3a718 100644 --- a/R/decon.R +++ b/R/decon.R @@ -447,7 +447,7 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), returnResult$decontXcounts <- as(returnResult$decontXcounts, class(counts)) } - }) + }, silent = TRUE) } From 091a249c59417bff21631044b4d062cde8af685e Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 9 Jan 2020 16:53:33 -0500 Subject: [PATCH 144/149] Added decontXcounts to SCE. Updated docs --- R/decon.R | 6 +++++- man/decontX.Rd | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/R/decon.R b/R/decon.R index 2df3a718..1d0c6473 100644 --- a/R/decon.R +++ b/R/decon.R @@ -49,7 +49,10 @@ #' @return If \code{x} is a matrix-like object, a list will be returned #' with the following items: #' \describe{ -#' \item{\code{decontXcounts}:}{The decontaminated count matrix.} +#' \item{\code{decontXcounts}:}{The decontaminated matrix. Values obtained +#' from the variational inference procedure may be non-integer. However, +#' integer counts can be obtained by rounding, +#' e.g. \code{round(decontXcounts)}.} #' \item{\code{contamination}:}{Percentage of contamination in each cell.} #' \item{\code{estimates}:}{List of estimated parameters for each batch. If z #' was not supplied, then the UMAP coordinates used to generated cell @@ -144,6 +147,7 @@ setMethod("decontX", "SingleCellExperiment", function(x, ## Save the rest of the result object into metadata + decontXcounts(x) <- result$decontXcounts result$decontXcounts <- NULL metadata(x)$decontX <- result diff --git a/man/decontX.Rd b/man/decontX.Rd index 21b1c1f9..29692266 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -98,7 +98,10 @@ a default value of 12345 is used. If NULL, no calls to If \code{x} is a matrix-like object, a list will be returned with the following items: \describe{ -\item{\code{decontXcounts}:}{The decontaminated count matrix.} +\item{\code{decontXcounts}:}{The decontaminated matrix. Values obtained +from the variational inference procedure may be non-integer. However, +integer counts can be obtained by rounding, +e.g. \code{round(decontXcounts)}.} \item{\code{contamination}:}{Percentage of contamination in each cell.} \item{\code{estimates}:}{List of estimated parameters for each batch. If z was not supplied, then the UMAP coordinates used to generated cell From 7f0014a712c26299b8c435199346033630ca2e93 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 9 Jan 2020 21:11:15 -0500 Subject: [PATCH 145/149] applied styler --- R/decon.R | 1806 +++++++++++++++++++++++++++-------------------------- 1 file changed, 931 insertions(+), 875 deletions(-) diff --git a/R/decon.R b/R/decon.R index 1d0c6473..857bd8b7 100644 --- a/R/decon.R +++ b/R/decon.R @@ -1,5 +1,5 @@ #' @title Contamination estimation with decontX -#' +#' #' @description Identifies contamination from factors such as ambient RNA #' in single cell genomic datasets. #' @@ -13,7 +13,7 @@ #' \linkS4class{SingleCellExperiment}. #' @param z Numeric or character vector. Cell cluster labels. If NULL, #' Celda will be used to reduce the dimensionality of the dataset -#' to 'L' modules, '\link[uwot]{umap}' from the 'uwot' package +#' to 'L' modules, '\link[uwot]{umap}' from the 'uwot' package #' will be used to further reduce the dataset to 2 dimenions and #' the '\link[dbscan]{dbscan}' function from the 'dbscan' package #' will be used to identify clusters of broad cell types. Default NULL. @@ -62,7 +62,7 @@ #' } #' #' If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated -#' counts will be stored as an assay and can be accessed with +#' counts will be stored as an assay and can be accessed with #' \code{decontXcounts(x)}. The contamination values and cluster labels #' will be stored in \code{colData(x)}. \code{estimates} and \code{runParams} #' will be stored in \code{metadata(x)$decontX}. If z was not supplied, then @@ -70,11 +70,11 @@ #' \code{reducedDims} slot in \code{x} #' #' @examples -#' s <- simulateContaminatedMatrix() -#' result <- decontX(s$observedCounts, s$z) -#' contamination <- colSums(s$observedCounts - s$nativeCounts) / -#' colSums(s$observedCounts) -#' plot(contamination, result$contamination) +#' s <- simulateContaminatedMatrix() +#' result <- decontX(s$observedCounts, s$z) +#' contamination <- colSums(s$observedCounts - s$nativeCounts) / +#' colSums(s$observedCounts) +#' plot(contamination, result$contamination) NULL #' @export @@ -89,134 +89,143 @@ setGeneric("decontX", function(x, ...) standardGeneric("decontX")) #' @export #' @rdname decontX setMethod("decontX", "SingleCellExperiment", function(x, - assayName="counts", - z = NULL, - batch = NULL, - maxIter = 500, - delta = 10, - convergence = 0.001, - iterLogLik = 10, - varGenes = 5000, - dbscanEps = 1, - L = 50, - seed = 12345, - logfile = NULL, - verbose = TRUE) -{ - mat <- SummarizedExperiment::assay(x, i=assayName) - result <- .decontX(counts = mat, - z = z, - batch = batch, - maxIter = maxIter, - convergence = convergence, - iterLogLik = iterLogLik, - delta = delta, - varGenes = varGenes, - L = L, - dbscanEps = dbscanEps, - seed = seed, - logfile = logfile, - verbose = verbose) - + assayName = "counts", + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE) { + mat <- SummarizedExperiment::assay(x, i = assayName) + result <- .decontX( + counts = mat, + z = z, + batch = batch, + maxIter = maxIter, + convergence = convergence, + iterLogLik = iterLogLik, + delta = delta, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + seed = seed, + logfile = logfile, + verbose = verbose + ) + ## Add results into column annotation - colData(x) = cbind(colData(x), - decontX_Contamination = result$contamination, - decontX_Clusters = result$z) + colData(x) <- cbind(colData(x), + decontX_Contamination = result$contamination, + decontX_Clusters = result$z + ) ## Put estimated UMAPs into SCE if z was estimated with Celda/UMAP if (is.null(result$runParams$z)) { batchIndex <- unique(result$runParams$batch) if (length(batchIndex) > 1) { - for(i in batchIndex) { - + for (i in batchIndex) { + ## Each individual UMAP will only be for one batch so need ## to put NAs in for cells in other batches tempUMAP <- matrix(NA, ncol = 2, nrow = ncol(mat)) - tempUMAP[result$runParams$batch == i,] <- result$estimates[[i]]$UMAP + tempUMAP[result$runParams$batch == i, ] <- result$estimates[[i]]$UMAP colnames(tempUMAP) <- c("UMAP_1", "UMAP_2") rownames(tempUMAP) <- colnames(mat) - - SingleCellExperiment::reducedDim(x, - paste("decontX", i, "UMAP", sep="_")) <- tempUMAP + + SingleCellExperiment::reducedDim( + x, + paste("decontX", i, "UMAP", sep = "_") + ) <- tempUMAP } } else { - SingleCellExperiment::reducedDim(x,"decontX_UMAP") <- - result$estimates[[batchIndex]]$UMAP + SingleCellExperiment::reducedDim(x, "decontX_UMAP") <- + result$estimates[[batchIndex]]$UMAP } - } - + } + ## Save the rest of the result object into metadata decontXcounts(x) <- result$decontXcounts result$decontXcounts <- NULL metadata(x)$decontX <- result - + return(x) -}) +}) #' @export #' @rdname decontX setMethod("decontX", "ANY", function(x, - z = NULL, - batch = NULL, - maxIter = 500, - delta = 10, - convergence = 0.001, - iterLogLik = 10, - varGenes = 5000, - dbscanEps = 1, - L = 50, - seed = 12345, - logfile = NULL, - verbose = TRUE) -{ - .decontX(counts = x, - z = z, - batch = batch, - maxIter = maxIter, - convergence = convergence, - iterLogLik = iterLogLik, - delta = delta, - varGenes = varGenes, - L = L, - dbscanEps = dbscanEps, - seed = seed, - logfile = logfile, - verbose = verbose) + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE) { + .decontX( + counts = x, + z = z, + batch = batch, + maxIter = maxIter, + convergence = convergence, + iterLogLik = iterLogLik, + delta = delta, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + seed = seed, + logfile = logfile, + verbose = verbose + ) }) ## Copied from SingleCellExperiment Package GET_FUN <- function(exprs_values, ...) { - (exprs_values) # To ensure evaluation - function(object, ...) { - assay(object, i=exprs_values, ...) - } + (exprs_values) # To ensure evaluation + function(object, ...) { + assay(object, i = exprs_values, ...) + } } SET_FUN <- function(exprs_values, ...) { - (exprs_values) # To ensure evaluation - function(object, ..., value) { - assay(object, i=exprs_values, ...) <- value - object - } + (exprs_values) # To ensure evaluation + function(object, ..., value) { + assay(object, i = exprs_values, ...) <- value + object + } } #' @export -setGeneric("decontXcounts", function(object, ...) - standardGeneric("decontXcounts")) +setGeneric("decontXcounts", function(object, ...) { + standardGeneric("decontXcounts") +}) #' @export -setGeneric("decontXcounts<-", function(object, ..., value) - standardGeneric("decontXcounts<-")) +setGeneric("decontXcounts<-", function(object, ..., value) { + standardGeneric("decontXcounts<-") +}) #' @export setMethod("decontXcounts", "SingleCellExperiment", GET_FUN("decontXcounts")) #' @export -setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), - SET_FUN("decontXcounts")) +setReplaceMethod( + "decontXcounts", c("SingleCellExperiment", "ANY"), + SET_FUN("decontXcounts") +) @@ -226,485 +235,504 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), ########################## .decontX <- function(counts, - z = NULL, - batch = NULL, - maxIter = 200, - convergence = 0.001, - iterLogLik = 10, - delta = 10, - varGenes = NULL, - L = NULL, - dbscanEps = NULL, - seed = 12345, - logfile = NULL, - verbose = TRUE) { - - startTime <- Sys.time() - .logMessages(paste(rep("-", 50), collapse = ""), + z = NULL, + batch = NULL, + maxIter = 200, + convergence = 0.001, + iterLogLik = 10, + delta = 10, + varGenes = NULL, + L = NULL, + dbscanEps = NULL, + seed = 12345, + logfile = NULL, + verbose = TRUE) { + startTime <- Sys.time() + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages("Starting DecontX", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + + runParams <- list( + z = z, + batch = batch, + maxIter = maxIter, + delta = delta, + convergence = convergence, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + logfile = logfile, + verbose = verbose + ) + + totalGenes <- nrow(counts) + totalCells <- ncol(counts) + geneNames <- rownames(counts) + nC <- ncol(counts) + allCellNames <- colnames(counts) + + ## Set up final deconaminated matrix + estRmat <- Matrix::Matrix( + data = 0, + ncol = totalCells, + nrow = totalGenes, + sparse = TRUE, + dimnames = list(geneNames, allCellNames) + ) + + ## Generate batch labels if none were supplied + if (is.null(batch)) { + batch <- rep("all_cells", nC) + } + runParams$batch <- batch + batchIndex <- unique(batch) + + ## Set result lists upfront for all cells from different batches + logLikelihood <- c() + estConp <- rep(NA, nC) + returnZ <- rep(NA, nC) + resBatch <- list() + + ## Cycle through each sample/batch and run DecontX + for (bat in batchIndex) { + if (length(batchIndex) == 1) { + .logMessages( + date(), + ".. Analyzing all cells", logfile = logfile, append = TRUE, - verbose = verbose) - .logMessages("Starting DecontX", + verbose = verbose + ) + } else { + .logMessages( + date(), + " .. Analyzing cells in batch '", + bat, "'", + sep = "", logfile = logfile, append = TRUE, - verbose = verbose) - .logMessages(paste(rep("-", 50), collapse = ""), + verbose = verbose + ) + } + + zBat <- NULL + countsBat <- counts[, batch == bat] + + ## Convert to sparse matrix + if (!inherits(countsBat, "dgCMatrix")) { + .logMessages( + date(), + ".... Converting to sparse matrix", logfile = logfile, append = TRUE, - verbose = verbose) + verbose = verbose + ) + countsBat <- as(countsBat, "dgCMatrix") + } - runParams <- list( - z = z, - batch = batch, + + if (!is.null(z)) { + zBat <- z[batch == bat] + } + if (is.null(seed)) { + res <- .decontXoneBatch( + counts = countsBat, + z = zBat, + batch = bat, maxIter = maxIter, delta = delta, convergence = convergence, + iterLogLik = iterLogLik, + logfile = logfile, + verbose = verbose, varGenes = varGenes, - L = L, dbscanEps = dbscanEps, - logfile = logfile, - verbose = verbose) - - totalGenes <- nrow(counts) - totalCells <- ncol(counts) - geneNames <- rownames(counts) - nC <- ncol(counts) - allCellNames <- colnames(counts) - - ## Set up final deconaminated matrix - estRmat <- Matrix::Matrix( - data = 0, - ncol = totalCells, - nrow = totalGenes, - sparse = TRUE, - dimnames = list(geneNames, allCellNames) + L = L + ) + } else { + withr::with_seed( + seed, + res <- .decontXoneBatch( + counts = countsBat, + z = zBat, + batch = bat, + maxIter = maxIter, + delta = delta, + convergence = convergence, + iterLogLik = iterLogLik, + logfile = logfile, + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps, + L = L + ) + ) + } + estRmat <- calculateNativeMatrix( + counts = countsBat, + native_counts = estRmat, + theta = res$theta, + eta = res$eta, + row_index = seq(nrow(counts)), + col_index = which(batch == bat), + phi = res$phi, + z = as.integer(res$z), + pseudocount = 1e-20 ) - ## Generate batch labels if none were supplied - if (is.null(batch)) { - batch <- rep("all_cells", nC) + resBatch[[bat]] <- list( + z = res$z, + phi = res$phi, + eta = res$eta, + delta = res$delta, + theta = res$theta, + logLikelihood = res$logLikelihood, + UMAP = res$UMAP, + z = res$z, + iteration = res$iteration + ) + + estConp[batch == bat] <- res$contamination + if (length(batchIndex) > 1) { + returnZ[batch == bat] <- paste0(bat, "-", res$z) + } else { + returnZ[batch == bat] <- res$z } - runParams$batch <- batch - batchIndex <- unique(batch) - - ## Set result lists upfront for all cells from different batches - logLikelihood <- c() - estConp <- rep(NA, nC) - returnZ <- rep(NA, nC) - resBatch <- list() - - ## Cycle through each sample/batch and run DecontX - for (bat in batchIndex) { - - if(length(batchIndex) == 1) { - .logMessages( - date(), - ".. Analyzing all cells", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - } else { - .logMessages( - date(), - " .. Analyzing cells in batch '", - bat, "'", - sep = "", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - } - - zBat <- NULL - countsBat <- counts[, batch == bat] - - ## Convert to sparse matrix - if (!inherits(countsBat, "dgCMatrix")) { - .logMessages( - date(), - ".... Converting to sparse matrix", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - countsBat <- as(countsBat, "dgCMatrix") - } - - if (!is.null(z)) { - zBat <- z[batch == bat] - } - if (is.null(seed)) { - res <- .decontXoneBatch( - counts = countsBat, - z = zBat, - batch = bat, - maxIter = maxIter, - delta = delta, - convergence = convergence, - iterLogLik = iterLogLik, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - dbscanEps = dbscanEps, - L = L) - } else { - withr::with_seed(seed, - res <- .decontXoneBatch( - counts = countsBat, - z = zBat, - batch = bat, - maxIter = maxIter, - delta = delta, - convergence = convergence, - iterLogLik = iterLogLik, - logfile = logfile, - verbose = verbose, - varGenes = varGenes, - dbscanEps = dbscanEps, - L = L) - ) - } - estRmat <- calculateNativeMatrix( - counts = countsBat, - native_counts = estRmat, - theta = res$theta, - eta = res$eta, - row_index = seq(nrow(counts)), - col_index = which(batch == bat), - phi = res$phi, - z = as.integer(res$z), - pseudocount = 1e-20 - ) - - resBatch[[bat]] <- list( - z = res$z, - phi = res$phi, - eta = res$eta, - delta = res$delta, - theta = res$theta, - logLikelihood = res$logLikelihood, - UMAP = res$UMAP, - z = res$z, - iteration = res$iteration - ) - - estConp[batch == bat] <- res$contamination - if(length(batchIndex) > 1) { - returnZ[batch == bat] <- paste0(bat, "-", res$z) - } else { - returnZ[batch == bat] <- res$z - } - -# if (is.null(logLikelihood)) { -# logLikelihood <- resBat$resList$logLikelihood -# } else { -# logLikelihood <- addLogLikelihood(logLikelihood, -# resBat$resList$logLikelihood) -# } - } - names(resBatch) <- batchIndex - -# runParams <- res$runParams - ## All batches share the same other parameters except cluster label z - ## So update z in the final returned result -# runParams$z <- returnZ -# method <- res$method - - returnResult <- list( - "runParams" = runParams, - "estimates" = resBatch, - "decontXcounts" = estRmat, - "contamination" = estConp, - "z" = returnZ - ) - - ## Try to convert class of new matrix to class of original matrix - if (inherits(counts, "dgCMatrix")) { - .logMessages( - date(), - ".. Finalizing decontaminated matrix", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - } - - if (inherits(counts, c("DelayedMatrix", "DelayedArray"))) { - - ## Determine class of seed in DelayedArray - seed.class <- unique(DelayedArray::seedApply(counts, class))[[1]] - if (seed.class == "HDF5ArraySeed") { - returnResult$decontXcounts <- as(returnResult$decontXcounts, "HDF5Matrix") - } else { - if (isTRUE(canCoerce(returnResult$decontXcounts, seed.class))) { - returnResult$decontXcounts <- as(returnResult$decontXcounts, seed.class) - } - } - returnResult$decontXcounts <- - DelayedArray::DelayedArray(returnResult$decontXcounts) + # if (is.null(logLikelihood)) { + # logLikelihood <- resBat$resList$logLikelihood + # } else { + # logLikelihood <- addLogLikelihood(logLikelihood, + # resBat$resList$logLikelihood) + # } + } + names(resBatch) <- batchIndex + + # runParams <- res$runParams + ## All batches share the same other parameters except cluster label z + ## So update z in the final returned result + # runParams$z <- returnZ + # method <- res$method + + returnResult <- list( + "runParams" = runParams, + "estimates" = resBatch, + "decontXcounts" = estRmat, + "contamination" = estConp, + "z" = returnZ + ) + + ## Try to convert class of new matrix to class of original matrix + if (inherits(counts, "dgCMatrix")) { + .logMessages( + date(), + ".. Finalizing decontaminated matrix", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + } - } else { - try({ - if (canCoerce(returnResult$decontXcounts, class(counts))) { + if (inherits(counts, c("DelayedMatrix", "DelayedArray"))) { + + ## Determine class of seed in DelayedArray + seed.class <- unique(DelayedArray::seedApply(counts, class))[[1]] + if (seed.class == "HDF5ArraySeed") { + returnResult$decontXcounts <- as(returnResult$decontXcounts, "HDF5Matrix") + } else { + if (isTRUE(canCoerce(returnResult$decontXcounts, seed.class))) { + returnResult$decontXcounts <- as(returnResult$decontXcounts, seed.class) + } + } + returnResult$decontXcounts <- + DelayedArray::DelayedArray(returnResult$decontXcounts) + } else { + try( + { + if (canCoerce(returnResult$decontXcounts, class(counts))) { returnResult$decontXcounts <- - as(returnResult$decontXcounts, class(counts)) + as(returnResult$decontXcounts, class(counts)) } - }, silent = TRUE) - } - - -# } else { ## When there is only one batch -# returnResult <- .decontXoneBatch( -# counts = counts, -# z = z, -# maxIter = maxIter, -# delta = delta, -# convergence = convergence, -# logfile = logfile, -# verbose = verbose, -# varGenes = varGenes, -# dbscanEps = dbscanEps, -# L = L -# ) - -# estRmat <- calculateNativeMatrix( -# counts = counts, -# native_counts = estRmat, -# theta = returnResult$theta, -# eta = returnResult$eta, -# row_index = which(noneEmptyGeneIndex), -# col_index = seq(totalCells), -# phi = returnResult$phi, -# z = as.integer(returnResult$runParams$z), -# pseudocount = 1e-20) - -# returnResult$decontX_counts <- estRmat -# } - - endTime <- Sys.time() - .logMessages(paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose) - .logMessages("Completed DecontX. Total time:", - format(difftime(endTime, startTime)), - logfile = logfile, - append = TRUE, - verbose = verbose) - .logMessages(paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose) - - return(returnResult) - + }, + silent = TRUE + ) + } + + + # } else { ## When there is only one batch + # returnResult <- .decontXoneBatch( + # counts = counts, + # z = z, + # maxIter = maxIter, + # delta = delta, + # convergence = convergence, + # logfile = logfile, + # verbose = verbose, + # varGenes = varGenes, + # dbscanEps = dbscanEps, + # L = L + # ) + + # estRmat <- calculateNativeMatrix( + # counts = counts, + # native_counts = estRmat, + # theta = returnResult$theta, + # eta = returnResult$eta, + # row_index = which(noneEmptyGeneIndex), + # col_index = seq(totalCells), + # phi = returnResult$phi, + # z = as.integer(returnResult$runParams$z), + # pseudocount = 1e-20) + + # returnResult$decontX_counts <- estRmat + # } + + endTime <- Sys.time() + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages("Completed DecontX. Total time:", + format(difftime(endTime, startTime)), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + + return(returnResult) } # This function updates decontamination for one batch .decontXoneBatch <- function(counts, - z = NULL, - batch = NULL, - maxIter = 200, - delta = 10, - convergence = 0.01, - iterLogLik = 10, - logfile = NULL, - verbose = TRUE, - varGenes = NULL, - dbscanEps = NULL, - L = NULL) { - .checkCountsDecon(counts) - .checkParametersDecon(proportionPrior = delta) - - # nG <- nrow(counts) - nC <- ncol(counts) - deconMethod <- "clustering" - - ## Generate cell cluster labels if none are provided - umap <- NULL - if (is.null(z)) { - .logMessages( - date(), - ".... Estimating cell types with Celda", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - ## Always uses clusters for DecontX estimation - #deconMethod <- "background" - - varGenes <- .processvarGenes(varGenes) - dbscanEps <- .processdbscanEps(dbscanEps) - L <- .processL(L) - - celda.init <- .decontxInitializeZ(object = counts, - varGenes = varGenes, - L = L, - dbscanEps = dbscanEps, - verbose = verbose, - logfile = logfile) - z <- celda.init$z - umap <- celda.init$umap - colnames(umap) <- c("DecontX_UMAP_1", - "DecontX_UMAP_2") - rownames(umap) <- colnames(counts) - } + z = NULL, + batch = NULL, + maxIter = 200, + delta = 10, + convergence = 0.01, + iterLogLik = 10, + logfile = NULL, + verbose = TRUE, + varGenes = NULL, + dbscanEps = NULL, + L = NULL) { + .checkCountsDecon(counts) + .checkParametersDecon(proportionPrior = delta) + + # nG <- nrow(counts) + nC <- ncol(counts) + deconMethod <- "clustering" + + ## Generate cell cluster labels if none are provided + umap <- NULL + if (is.null(z)) { + .logMessages( + date(), + ".... Estimating cell types with Celda", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + ## Always uses clusters for DecontX estimation + # deconMethod <- "background" - z <- .processCellLabels(z, numCells = nC) - K <- length(unique(z)) + varGenes <- .processvarGenes(varGenes) + dbscanEps <- .processdbscanEps(dbscanEps) + L <- .processL(L) - iter <- 1L - numIterWithoutImprovement <- 0L - stopIter <- 3L + celda.init <- .decontxInitializeZ( + object = counts, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + verbose = verbose, + logfile = logfile + ) + z <- celda.init$z + umap <- celda.init$umap + colnames(umap) <- c( + "DecontX_UMAP_1", + "DecontX_UMAP_2" + ) + rownames(umap) <- colnames(counts) + } + + z <- .processCellLabels(z, numCells = nC) + K <- length(unique(z)) + + iter <- 1L + numIterWithoutImprovement <- 0L + stopIter <- 3L + + .logMessages( + date(), + ".... Estimating contamination", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + + if (deconMethod == "clustering") { + ## Initialization + deltaInit <- delta + # theta = stats::runif(nC, min = 0.1, max = 0.5) + theta <- stats::rbeta( + n = nC, + shape1 = deltaInit, + shape2 = deltaInit + ) - .logMessages( - date(), - ".... Estimating contamination", - logfile = logfile, - append = TRUE, - verbose = verbose + + nextDecon <- decontXInitialize( + counts = counts, + theta = theta, + z = z, + pseudocount = 1e-20 + ) + phi <- nextDecon$phi + eta <- nextDecon$eta + + # estRmat <- Matrix::t(Matrix::t(counts) * theta) + # phi <- .colSumByGroupNumeric(as.matrix(estRmat), z, K) + # eta <- rowSums(phi) - phi + # phi <- normalizeCounts(phi, + # normalize = "proportion", + # pseudocountNormalize = 1e-20) + # eta <- normalizeCounts(eta, + # normalize = "proportion", + # pseudocountNormalize = 1e-20) + ll <- c() + + + # llRound <- .deconCalcLL( + # counts = counts, + # z = z, + # phi = phi, + # eta = eta, + # theta = theta + # ) + llRound <- decontXLogLik( + counts = counts, + z = z, + phi = phi, + eta = eta, + theta = theta, + pseudocount = 1e-20 ) - if (deconMethod == "clustering") { - ## Initialization - deltaInit <- delta - # theta = stats::runif(nC, min = 0.1, max = 0.5) - theta <- stats::rbeta(n = nC, - shape1 = deltaInit, - shape2 = deltaInit) - - - nextDecon <- decontXInitialize( - counts = counts, - theta = theta, - z = z, - pseudocount = 1e-20) - phi <- nextDecon$phi - eta <- nextDecon$eta - -# estRmat <- Matrix::t(Matrix::t(counts) * theta) -# phi <- .colSumByGroupNumeric(as.matrix(estRmat), z, K) -# eta <- rowSums(phi) - phi -# phi <- normalizeCounts(phi, -# normalize = "proportion", -# pseudocountNormalize = 1e-20) -# eta <- normalizeCounts(eta, -# normalize = "proportion", -# pseudocountNormalize = 1e-20) - ll <- c() - - -# llRound <- .deconCalcLL( -# counts = counts, -# z = z, -# phi = phi, -# eta = eta, -# theta = theta -# ) - llRound <- decontXLogLik( - counts = counts, - z = z, - phi = phi, - eta = eta, - theta = theta, - pseudocount = 1e-20) - - ## EM updates - theta.previous <- theta - converged <- FALSE - counts.colsums <- Matrix::colSums(counts) - while (iter <= maxIter & !isTRUE(converged) & - numIterWithoutImprovement <= stopIter) { -# nextDecon <- .cDCalcEMDecontamination( -# counts = counts, -# phi = phi, -# eta = eta, -# theta = theta, -# z = z, -# K = K, -# delta = delta -# ) - nextDecon <- decontXEM(counts = counts, - counts_colsums = counts.colsums, - phi = phi, - eta = eta, - theta = theta, - z = z, - pseudocount = 1e-20) - - theta <- nextDecon$theta - phi <- nextDecon$phi - eta <- nextDecon$eta - delta <- nextDecon$delta - - ## Calculate log-likelihood -# llTemp <- .deconCalcLL( -# counts = counts, -# z = z, -# phi = phi, -# eta = eta, -# theta = theta -# ) - - max.divergence <- max(abs(theta.previous - theta)) - if (max.divergence < convergence) { - converged <- TRUE - } - theta.previous <- theta - - ## Calculate likelihood and check for convergence - if (iter %% iterLogLik == 0 || converged) { - - llTemp <- decontXLogLik( - counts = counts, - z = z, - phi = phi, - eta = eta, - theta = theta, - pseudocount = 1e-20) - - ll <- c(ll, llTemp) -# llRound <- c(llRound, round(llTemp, 2)) - -# if (round(llTemp, 2) > llRound[iter] | iter == 1) { -# numIterWithoutImprovement <- 1L -# } else { -# numIterWithoutImprovement <- numIterWithoutImprovement + 1L -# } - - .logMessages(date(), - "...... Completed iteration:", - iter, - "| converge:", - signif(max.divergence, 4), - logfile = logfile, - append = TRUE, - verbose = verbose) - } - - iter <- iter + 1L - - } - } + ## EM updates + theta.previous <- theta + converged <- FALSE + counts.colsums <- Matrix::colSums(counts) + while (iter <= maxIter & !isTRUE(converged) & + numIterWithoutImprovement <= stopIter) { + # nextDecon <- .cDCalcEMDecontamination( + # counts = counts, + # phi = phi, + # eta = eta, + # theta = theta, + # z = z, + # K = K, + # delta = delta + # ) + nextDecon <- decontXEM( + counts = counts, + counts_colsums = counts.colsums, + phi = phi, + eta = eta, + theta = theta, + z = z, + pseudocount = 1e-20 + ) + + theta <- nextDecon$theta + phi <- nextDecon$phi + eta <- nextDecon$eta + delta <- nextDecon$delta + + ## Calculate log-likelihood + # llTemp <- .deconCalcLL( + # counts = counts, + # z = z, + # phi = phi, + # eta = eta, + # theta = theta + # ) + + max.divergence <- max(abs(theta.previous - theta)) + if (max.divergence < convergence) { + converged <- TRUE + } + theta.previous <- theta + + ## Calculate likelihood and check for convergence + if (iter %% iterLogLik == 0 || converged) { + llTemp <- decontXLogLik( + counts = counts, + z = z, + phi = phi, + eta = eta, + theta = theta, + pseudocount = 1e-20 + ) -# resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) - resConp <- nextDecon$contamination - names(resConp) <- colnames(counts) + ll <- c(ll, llTemp) + # llRound <- c(llRound, round(llTemp, 2)) + + # if (round(llTemp, 2) > llRound[iter] | iter == 1) { + # numIterWithoutImprovement <- 1L + # } else { + # numIterWithoutImprovement <- numIterWithoutImprovement + 1L + # } + + .logMessages(date(), + "...... Completed iteration:", + iter, + "| converge:", + signif(max.divergence, 4), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + } - return(list( - "logLikelihood" = ll, - "contamination" = resConp, - "theta" = theta, - "delta" = delta, - "phi" = phi, - "eta" = eta, - "UMAP" = umap, - "iteration" = iter - 1L, - "z" = z - )) + iter <- iter + 1L + } + } + + # resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) + resConp <- nextDecon$contamination + names(resConp) <- colnames(counts) + + return(list( + "logLikelihood" = ll, + "contamination" = resConp, + "theta" = theta, + "delta" = delta, + "phi" = phi, + "eta" = eta, + "UMAP" = umap, + "iteration" = iter - 1L, + "z" = z + )) } @@ -722,11 +750,11 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), # populations # theta Numeric vector. Proportion of truely expressed transcripts .deconCalcLL <- function(counts, z, phi, eta, theta) { - # ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + - # 1e-20 ) ) # when dist_mat are K x G matrices - ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + - (1 - theta) * t(eta)[z, ] + 1e-20)) - return(ll) + # ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + + # 1e-20 ) ) # when dist_mat are K x G matrices + ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + + (1 - theta) * t(eta)[z, ] + 1e-20)) + return(ll) } # DEPRECATED. This is not used, but is kept as it might be useful in the future @@ -735,11 +763,11 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), # bgDist Numeric matrix. Rows represent feature and columns are the times that # the background-distribution has been replicated. .bgCalcLL <- function(counts, globalZ, cbZ, phi, eta, theta) { - # ll <- sum(t(counts) * log(theta * t(cellDist) + - # (1 - theta) * t(bgDist) + 1e-20)) - ll <- sum(t(counts) * log(theta * t(phi)[cbZ, ] + - (1 - theta) * t(eta)[globalZ, ] + 1e-20)) - return(ll) + # ll <- sum(t(counts) * log(theta * t(cellDist) + + # (1 - theta) * t(bgDist) + 1e-20)) + ll <- sum(t(counts) * log(theta * t(phi)[cbZ, ] + + (1 - theta) * t(eta)[globalZ, ] + 1e-20)) + return(ll) } @@ -751,92 +779,98 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), # theta Numeric vector. Proportion of truely expressed transctripts #' @importFrom MCMCprecision fit_dirichlet .cDCalcEMDecontamination <- function(counts, - phi, - eta, - theta, - z, - K, - delta) { - ## Notes: use fix-point iteration to update prior for theta, no need - ## to feed delta anymore - - logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20) - logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20) - Pr.e <- exp(logPr) - Pc.e <- exp(logPc) - Pr <- Pr.e / (Pr.e + Pc.e) + phi, + eta, + theta, + z, + K, + delta) { + ## Notes: use fix-point iteration to update prior for theta, no need + ## to feed delta anymore + + logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20) + logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20) + Pr.e <- exp(logPr) + Pc.e <- exp(logPc) + Pr <- Pr.e / (Pr.e + Pc.e) + + estRmat <- t(Pr) * counts + rnGByK <- .colSumByGroupNumeric(estRmat, z, K) + cnGByK <- rowSums(rnGByK) - rnGByK + + counts.cs <- colSums(counts) + estRmat.cs <- colSums(estRmat) + estRmat.cs.n <- estRmat.cs / counts.cs + estCmat.cs.n <- 1 - estRmat.cs.n + temp <- cbind(estRmat.cs.n, estCmat.cs.n) + deltaV2 <- MCMCprecision::fit_dirichlet(temp)$alpha + + ## Update parameters + theta <- + (estRmat.cs + deltaV2[1]) / (counts.cs + sum(deltaV2)) + phi <- normalizeCounts(rnGByK, + normalize = "proportion", + pseudocountNormalize = 1e-20 + ) + eta <- normalizeCounts(cnGByK, + normalize = "proportion", + pseudocountNormalize = 1e-20 + ) + + return(list( + "estRmat" = estRmat, + "theta" = theta, + "phi" = phi, + "eta" = eta, + "delta" = deltaV2 + )) +} - estRmat <- t(Pr) * counts - rnGByK <- .colSumByGroupNumeric(estRmat, z, K) - cnGByK <- rowSums(rnGByK) - rnGByK +# DEPRECATED. This is not used, but is kept as it might be useful in the +# feature. +# This function updates decontamination using background distribution +.cDCalcEMbgDecontamination <- + function(counts, globalZ, cbZ, trZ, phi, eta, theta) { + logPr <- log(t(phi)[cbZ, ] + 1e-20) + log(theta + 1e-20) + logPc <- + log(t(eta)[globalZ, ] + 1e-20) + log(1 - theta + 1e-20) - counts.cs <- colSums(counts) - estRmat.cs <- colSums(estRmat) - estRmat.cs.n <- estRmat.cs / counts.cs - estCmat.cs.n <- 1 - estRmat.cs.n - temp <- cbind(estRmat.cs.n, estCmat.cs.n) - deltaV2 <- MCMCprecision::fit_dirichlet(temp)$alpha + Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) + Pc <- 1 - Pr + deltaV2 <- + MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha - ## Update parameters + estRmat <- t(Pr) * counts + phiUnnormalized <- + .colSumByGroupNumeric(estRmat, cbZ, max(cbZ)) + etaUnnormalized <- + rowSums(phiUnnormalized) - .colSumByGroupNumeric( + phiUnnormalized, + trZ, max(trZ) + ) + + ## Update paramters theta <- - (estRmat.cs + deltaV2[1]) / (counts.cs + sum(deltaV2)) - phi <- normalizeCounts(rnGByK, + (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) + phi <- + normalizeCounts(phiUnnormalized, normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- normalizeCounts(cnGByK, + pseudocountNormalize = 1e-20 + ) + eta <- + normalizeCounts(etaUnnormalized, normalize = "proportion", - pseudocountNormalize = 1e-20) + pseudocountNormalize = 1e-20 + ) return(list( - "estRmat" = estRmat, - "theta" = theta, - "phi" = phi, - "eta" = eta, - "delta" = deltaV2 + "estRmat" = estRmat, + "theta" = theta, + "phi" = phi, + "eta" = eta, + "delta" = deltaV2 )) -} - -# DEPRECATED. This is not used, but is kept as it might be useful in the -# feature. -# This function updates decontamination using background distribution -.cDCalcEMbgDecontamination <- - function(counts, globalZ, cbZ, trZ, phi, eta, theta) { - logPr <- log(t(phi)[cbZ, ] + 1e-20) + log(theta + 1e-20) - logPc <- - log(t(eta)[globalZ, ] + 1e-20) + log(1 - theta + 1e-20) - - Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) - Pc <- 1 - Pr - deltaV2 <- - MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha - - estRmat <- t(Pr) * counts - phiUnnormalized <- - .colSumByGroupNumeric(estRmat, cbZ, max(cbZ)) - etaUnnormalized <- - rowSums(phiUnnormalized) - .colSumByGroupNumeric(phiUnnormalized, - trZ, max(trZ)) - - ## Update paramters - theta <- - (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) - phi <- - normalizeCounts(phiUnnormalized, - normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- - normalizeCounts(etaUnnormalized, - normalize = "proportion", - pseudocountNormalize = 1e-20) - - return(list( - "estRmat" = estRmat, - "theta" = theta, - "phi" = phi, - "eta" = eta, - "delta" = deltaV2 - )) -} + } @@ -844,199 +878,210 @@ setReplaceMethod("decontXcounts", c("SingleCellExperiment", "ANY"), ## Make sure provided parameters are the right type and value range .checkParametersDecon <- function(proportionPrior) { - if (length(proportionPrior) > 1 | any(proportionPrior <= 0)) { - stop("'delta' should be a single positive value.") - } + if (length(proportionPrior) > 1 | any(proportionPrior <= 0)) { + stop("'delta' should be a single positive value.") + } } ## Make sure provided count matrix is the right type .checkCountsDecon <- function(counts) { - if (sum(is.na(counts)) > 0) { - stop("Missing value in 'counts' matrix.") - } - if (is.null(dim(counts))) { - stop("At least 2 genes need to have non-zero expressions.") - } + if (sum(is.na(counts)) > 0) { + stop("Missing value in 'counts' matrix.") + } + if (is.null(dim(counts))) { + stop("At least 2 genes need to have non-zero expressions.") + } } ## Make sure provided cell labels are the right type #' @importFrom plyr mapvalues .processCellLabels <- function(z, numCells) { - if (length(z) != numCells) { - stop("'z' must be of the same length as the number of cells in the", - " 'counts' matrix.") - } - if (length(unique(z)) < 2) { - stop("No need to decontaminate when only one cluster", - " is in the dataset.") # Even though - # everything runs smoothly when length(unique(z)) == 1, result is not - # trustful - } - if (!is.factor(z)) { - z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) - z <- as.factor(z) - } - return(z) + if (length(z) != numCells) { + stop( + "'z' must be of the same length as the number of cells in the", + " 'counts' matrix." + ) + } + if (length(unique(z)) < 2) { + stop( + "No need to decontaminate when only one cluster", + " is in the dataset." + ) # Even though + # everything runs smoothly when length(unique(z)) == 1, result is not + # trustful + } + if (!is.factor(z)) { + z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) + z <- as.factor(z) + } + return(z) } ## Add two (veried-length) vectors of logLikelihood addLogLikelihood <- function(llA, llB) { - lengthA <- length(llA) - lengthB <- length(llB) - - if (lengthA >= lengthB) { - llB <- c(llB, rep(llB[lengthB], lengthA - lengthB)) - ll <- llA + llB - } else { - llA <- c(llA, rep(llA[lengthA], lengthB - lengthA)) - ll <- llA + llB - } - - return(ll) + lengthA <- length(llA) + lengthB <- length(llB) + + if (lengthA >= lengthB) { + llB <- c(llB, rep(llB[lengthB], lengthA - lengthB)) + ll <- llA + llB + } else { + llA <- c(llA, rep(llA[lengthA], lengthB - lengthA)) + ll <- llA + llB + } + + return(ll) } ## Initialization of cell labels for DecontX when they are not given .decontxInitializeZ <- - function(object, # object is either a sce object or a count matrix - varGenes = 5000, - L = 50, - dbscanEps = 1.0, - verbose = TRUE, - logfile = NULL) { - - if (!is(object, "SingleCellExperiment")) { - sce <- SingleCellExperiment::SingleCellExperiment(assays = - list(counts = object)) - } + function(object, # object is either a sce object or a count matrix + varGenes = 5000, + L = 50, + dbscanEps = 1.0, + verbose = TRUE, + logfile = NULL) { + if (!is(object, "SingleCellExperiment")) { + sce <- SingleCellExperiment::SingleCellExperiment( + assays = + list(counts = object) + ) + } - ## Add the log2 normalized counts into sce object - ## The normalized counts is also centered using library size in the - ## original count matrix in scater::normalizeSCE() - #sce <- suppressWarnings(scater::normalizeSCE(sce)) - sce <- scater::logNormCounts(sce, log = TRUE) - - if (nrow(sce) <= varGenes) { - topVariableGenes <- seq_len(nrow(sce)) - } else if (nrow(sce) > varGenes) { - ## Use the top most variable genes to do rough clustering - ## (celda_CG & Louvian graph algorithm) - #mvTrend <- scran::trendVar(sce, use.spikes = FALSE) - #decomposeTrend <- scran::decomposeVar(sce, mvTrend) - #topVariableGenes <- order(decomposeTrend$bio, - # decreasing = TRUE)[seq(varGenes)] - - sce.var <- scran::modelGeneVar(sce) - topVariableGenes <- order(sce.var$bio, - decreasing = TRUE)[seq(varGenes)] - } - countsFiltered <- as.matrix(SingleCellExperiment::counts( - sce[topVariableGenes, ])) - storage.mode(countsFiltered) <- "integer" - - .logMessages( - date(), - "...... Collapsing features into", - L, - "modules", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - ## Celda clustering using recursive module splitting - L <- min(L, nrow(countsFiltered)) - initialModuleSplit <- recursiveSplitModule(countsFiltered, - initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) - initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) - - #if (L < nrow(countsFiltered)) { - # initialModuleSplit <- recursiveSplitModule(countsFiltered, - # initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) - # initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) - # fm <- factorizeMatrix(countsFiltered, initialModel, type = "counts") - # fm <- fm$counts$cell - # rm(initialModuleSplit) - # rm(initialModel) - #} else { - # fm <- countsFiltered - #} - - .logMessages( - date(), - "...... Reducing dimensionality with UMAP", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - ## Louvan graph-based method to reduce dimension into 2 cluster - nNeighbors <- min(15, ncol(countsFiltered)) - #resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors, - # min_dist = 0.01, spread = 1) - #rm(fm) - resUmap <- celdaUmap(countsFiltered, initialModel, - minDist = 0.01, spread = 1, nNeighbors = nNeighbors) - - .logMessages( - date(), - " ...... Determining cell clusters with DBSCAN (Eps=", - dbscanEps, - ")", - sep = "", - logfile = logfile, - append = TRUE, - verbose = verbose - ) - # Use dbSCAN on the UMAP to identify broad cell types - totalClusters <- 1 - while (totalClusters <= 1 & dbscanEps > 0) { - resDbscan <- dbscan::dbscan(resUmap, dbscanEps) - dbscanEps <- dbscanEps - (0.25 * dbscanEps) - totalClusters <- length(unique(resDbscan$cluster)) - } + ## Add the log2 normalized counts into sce object + ## The normalized counts is also centered using library size in the + ## original count matrix in scater::normalizeSCE() + # sce <- suppressWarnings(scater::normalizeSCE(sce)) + sce <- scater::logNormCounts(sce, log = TRUE) + + if (nrow(sce) <= varGenes) { + topVariableGenes <- seq_len(nrow(sce)) + } else if (nrow(sce) > varGenes) { + ## Use the top most variable genes to do rough clustering + ## (celda_CG & Louvian graph algorithm) + # mvTrend <- scran::trendVar(sce, use.spikes = FALSE) + # decomposeTrend <- scran::decomposeVar(sce, mvTrend) + # topVariableGenes <- order(decomposeTrend$bio, + # decreasing = TRUE)[seq(varGenes)] + + sce.var <- scran::modelGeneVar(sce) + topVariableGenes <- order(sce.var$bio, + decreasing = TRUE + )[seq(varGenes)] + } + countsFiltered <- as.matrix(SingleCellExperiment::counts( + sce[topVariableGenes, ] + )) + storage.mode(countsFiltered) <- "integer" + + .logMessages( + date(), + "...... Collapsing features into", + L, + "modules", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + ## Celda clustering using recursive module splitting + L <- min(L, nrow(countsFiltered)) + initialModuleSplit <- recursiveSplitModule(countsFiltered, + initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE + ) + initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) + + # if (L < nrow(countsFiltered)) { + # initialModuleSplit <- recursiveSplitModule(countsFiltered, + # initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + # initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) + # fm <- factorizeMatrix(countsFiltered, initialModel, type = "counts") + # fm <- fm$counts$cell + # rm(initialModuleSplit) + # rm(initialModel) + # } else { + # fm <- countsFiltered + # } + + .logMessages( + date(), + "...... Reducing dimensionality with UMAP", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + ## Louvan graph-based method to reduce dimension into 2 cluster + nNeighbors <- min(15, ncol(countsFiltered)) + # resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors, + # min_dist = 0.01, spread = 1) + # rm(fm) + resUmap <- celdaUmap(countsFiltered, initialModel, + minDist = 0.01, spread = 1, nNeighbors = nNeighbors + ) - return(list("z" = resDbscan$cluster, - "umap" = resUmap)) + .logMessages( + date(), + " ...... Determining cell clusters with DBSCAN (Eps=", + dbscanEps, + ")", + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + # Use dbSCAN on the UMAP to identify broad cell types + totalClusters <- 1 + while (totalClusters <= 1 & dbscanEps > 0) { + resDbscan <- dbscan::dbscan(resUmap, dbscanEps) + dbscanEps <- dbscanEps - (0.25 * dbscanEps) + totalClusters <- length(unique(resDbscan$cluster)) } + return(list( + "z" = resDbscan$cluster, + "umap" = resUmap + )) + } + ## process varGenes .processvarGenes <- function(varGenes) { - if (is.null(varGenes)) { - varGenes <- 5000 - } else { - if (varGenes < 2 | length(varGenes) > 1) { - stop("Parameter 'varGenes' must be an integer larger than 1.") - } + if (is.null(varGenes)) { + varGenes <- 5000 + } else { + if (varGenes < 2 | length(varGenes) > 1) { + stop("Parameter 'varGenes' must be an integer larger than 1.") } - return(varGenes) + } + return(varGenes) } ## process dbscanEps for resolusion threshold using DBSCAN .processdbscanEps <- function(dbscanEps) { - if (is.null(dbscanEps)) { - dbscanEps <- 1 - } else { - if (dbscanEps < 0) { - stop("Parameter 'dbscanEps' needs to be non-negative.") - } + if (is.null(dbscanEps)) { + dbscanEps <- 1 + } else { + if (dbscanEps < 0) { + stop("Parameter 'dbscanEps' needs to be non-negative.") } - return(dbscanEps) + } + return(dbscanEps) } ## process gene modules L .processL <- function(L) { - if (is.null(L)) { - L <- 50 - } else { - if (L < 2 | length(L) > 1) { - stop("Parameter 'L' must be an integer larger than 1.") - } + if (is.null(L)) { + L <- 50 + } else { + if (L < 2 | length(L) > 1) { + stop("Parameter 'L' must be an integer larger than 1.") } - return(L) + } + return(L) } @@ -1072,104 +1117,115 @@ addLogLikelihood <- function(llA, llB) { #' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = 1) #' @export simulateContaminatedMatrix <- function(C = 300, - G = 100, - K = 3, - NRange = c(500, 1000), - beta = 0.5, - delta = c(1, 2), - seed = 12345) { - - if (is.null(seed)) { - res <- .simulateContaminatedMatrix(C = C, - G = G, - K = K, - NRange = NRange, - beta = beta, - delta = delta) - } else { - with_seed(seed, - res <- .simulateContaminatedMatrix(C = C, - G = G, - K = K, - NRange = NRange, - beta = beta, - delta = delta)) - } + G = 100, + K = 3, + NRange = c(500, 1000), + beta = 0.5, + delta = c(1, 2), + seed = 12345) { + if (is.null(seed)) { + res <- .simulateContaminatedMatrix( + C = C, + G = G, + K = K, + NRange = NRange, + beta = beta, + delta = delta + ) + } else { + with_seed( + seed, + res <- .simulateContaminatedMatrix( + C = C, + G = G, + K = K, + NRange = NRange, + beta = beta, + delta = delta + ) + ) + } - return(res) + return(res) } .simulateContaminatedMatrix <- function(C = 300, - G = 100, - K = 3, - NRange = c(500, 1000), - beta = 0.5, - delta = c(1, 2)) { - if (length(delta) == 1) { - cpByC <- stats::rbeta(n = C, - shape1 = delta, - shape2 = delta) - } else { - cpByC <- stats::rbeta(n = C, - shape1 = delta[1], - shape2 = delta[2]) - } - - z <- sample(seq(K), size = C, replace = TRUE) - if (length(unique(z)) < K) { - warning( - "Only ", - length(unique(z)), - " clusters are simulated. Try to increase numebr of cells 'C' if", - " more clusters are needed" - ) - K <- length(unique(z)) - z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) - } - - NbyC <- sample(seq(min(NRange), max(NRange)), - size = C, - replace = TRUE) - cNbyC <- vapply(seq(C), function(i) { - stats::rbinom(n = 1, - size = NbyC[i], - p = cpByC[i]) - }, integer(1)) - rNbyC <- NbyC - cNbyC - - phi <- .rdirichlet(K, rep(beta, G)) - - ## sample real expressed count matrix - cellRmat <- vapply(seq(C), function(i) { - stats::rmultinom(1, size = rNbyC[i], prob = phi[z[i], ]) - }, integer(G)) - - rownames(cellRmat) <- paste0("Gene_", seq(G)) - colnames(cellRmat) <- paste0("Cell_", seq(C)) - - ## sample contamination count matrix - nGByK <- - rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K) - eta <- normalizeCounts(counts = nGByK, normalize = "proportion") - - cellCmat <- vapply(seq(C), function(i) { - stats::rmultinom(1, size = cNbyC[i], prob = eta[, z[i]]) - }, integer(G)) - cellOmat <- cellRmat + cellCmat - - rownames(cellOmat) <- paste0("Gene_", seq(G)) - colnames(cellOmat) <- paste0("Cell_", seq(C)) - - return( - list( - "nativeCounts" = cellRmat, - "observedCounts" = cellOmat, - "NByC" = NbyC, - "z" = z, - "eta" = eta, - "phi" = t(phi) - ) + G = 100, + K = 3, + NRange = c(500, 1000), + beta = 0.5, + delta = c(1, 2)) { + if (length(delta) == 1) { + cpByC <- stats::rbeta( + n = C, + shape1 = delta, + shape2 = delta + ) + } else { + cpByC <- stats::rbeta( + n = C, + shape1 = delta[1], + shape2 = delta[2] ) + } + + z <- sample(seq(K), size = C, replace = TRUE) + if (length(unique(z)) < K) { + warning( + "Only ", + length(unique(z)), + " clusters are simulated. Try to increase numebr of cells 'C' if", + " more clusters are needed" + ) + K <- length(unique(z)) + z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) + } + + NbyC <- sample(seq(min(NRange), max(NRange)), + size = C, + replace = TRUE + ) + cNbyC <- vapply(seq(C), function(i) { + stats::rbinom( + n = 1, + size = NbyC[i], + p = cpByC[i] + ) + }, integer(1)) + rNbyC <- NbyC - cNbyC + + phi <- .rdirichlet(K, rep(beta, G)) + + ## sample real expressed count matrix + cellRmat <- vapply(seq(C), function(i) { + stats::rmultinom(1, size = rNbyC[i], prob = phi[z[i], ]) + }, integer(G)) + + rownames(cellRmat) <- paste0("Gene_", seq(G)) + colnames(cellRmat) <- paste0("Cell_", seq(C)) + + ## sample contamination count matrix + nGByK <- + rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K) + eta <- normalizeCounts(counts = nGByK, normalize = "proportion") + + cellCmat <- vapply(seq(C), function(i) { + stats::rmultinom(1, size = cNbyC[i], prob = eta[, z[i]]) + }, integer(G)) + cellOmat <- cellRmat + cellCmat + + rownames(cellOmat) <- paste0("Gene_", seq(G)) + colnames(cellOmat) <- paste0("Cell_", seq(C)) + + return( + list( + "nativeCounts" = cellRmat, + "observedCounts" = cellOmat, + "NByC" = NbyC, + "z" = z, + "eta" = eta, + "phi" = t(phi) + ) + ) } - From 01899de7007a3d1ba6a2c2a01a3bc44fd704f277 Mon Sep 17 00:00:00 2001 From: "Joshua D. Campbell" Date: Thu, 9 Jan 2020 21:14:09 -0500 Subject: [PATCH 146/149] Removed old commented code --- R/decon.R | 109 +----------------------------------------------------- 1 file changed, 1 insertion(+), 108 deletions(-) diff --git a/R/decon.R b/R/decon.R index 857bd8b7..4cab952a 100644 --- a/R/decon.R +++ b/R/decon.R @@ -411,21 +411,9 @@ setReplaceMethod( returnZ[batch == bat] <- res$z } - # if (is.null(logLikelihood)) { - # logLikelihood <- resBat$resList$logLikelihood - # } else { - # logLikelihood <- addLogLikelihood(logLikelihood, - # resBat$resList$logLikelihood) - # } } names(resBatch) <- batchIndex - # runParams <- res$runParams - ## All batches share the same other parameters except cluster label z - ## So update z in the final returned result - # runParams$z <- returnZ - # method <- res$method - returnResult <- list( "runParams" = runParams, "estimates" = resBatch, @@ -470,35 +458,6 @@ setReplaceMethod( ) } - - # } else { ## When there is only one batch - # returnResult <- .decontXoneBatch( - # counts = counts, - # z = z, - # maxIter = maxIter, - # delta = delta, - # convergence = convergence, - # logfile = logfile, - # verbose = verbose, - # varGenes = varGenes, - # dbscanEps = dbscanEps, - # L = L - # ) - - # estRmat <- calculateNativeMatrix( - # counts = counts, - # native_counts = estRmat, - # theta = returnResult$theta, - # eta = returnResult$eta, - # row_index = which(noneEmptyGeneIndex), - # col_index = seq(totalCells), - # phi = returnResult$phi, - # z = as.integer(returnResult$runParams$z), - # pseudocount = 1e-20) - - # returnResult$decontX_counts <- estRmat - # } - endTime <- Sys.time() .logMessages(paste(rep("-", 50), collapse = ""), logfile = logfile, @@ -593,7 +552,6 @@ setReplaceMethod( if (deconMethod == "clustering") { ## Initialization deltaInit <- delta - # theta = stats::runif(nC, min = 0.1, max = 0.5) theta <- stats::rbeta( n = nC, shape1 = deltaInit, @@ -610,25 +568,7 @@ setReplaceMethod( phi <- nextDecon$phi eta <- nextDecon$eta - # estRmat <- Matrix::t(Matrix::t(counts) * theta) - # phi <- .colSumByGroupNumeric(as.matrix(estRmat), z, K) - # eta <- rowSums(phi) - phi - # phi <- normalizeCounts(phi, - # normalize = "proportion", - # pseudocountNormalize = 1e-20) - # eta <- normalizeCounts(eta, - # normalize = "proportion", - # pseudocountNormalize = 1e-20) ll <- c() - - - # llRound <- .deconCalcLL( - # counts = counts, - # z = z, - # phi = phi, - # eta = eta, - # theta = theta - # ) llRound <- decontXLogLik( counts = counts, z = z, @@ -644,15 +584,7 @@ setReplaceMethod( counts.colsums <- Matrix::colSums(counts) while (iter <= maxIter & !isTRUE(converged) & numIterWithoutImprovement <= stopIter) { - # nextDecon <- .cDCalcEMDecontamination( - # counts = counts, - # phi = phi, - # eta = eta, - # theta = theta, - # z = z, - # K = K, - # delta = delta - # ) + nextDecon <- decontXEM( counts = counts, counts_colsums = counts.colsums, @@ -668,15 +600,6 @@ setReplaceMethod( eta <- nextDecon$eta delta <- nextDecon$delta - ## Calculate log-likelihood - # llTemp <- .deconCalcLL( - # counts = counts, - # z = z, - # phi = phi, - # eta = eta, - # theta = theta - # ) - max.divergence <- max(abs(theta.previous - theta)) if (max.divergence < convergence) { converged <- TRUE @@ -695,13 +618,6 @@ setReplaceMethod( ) ll <- c(ll, llTemp) - # llRound <- c(llRound, round(llTemp, 2)) - - # if (round(llTemp, 2) > llRound[iter] | iter == 1) { - # numIterWithoutImprovement <- 1L - # } else { - # numIterWithoutImprovement <- numIterWithoutImprovement + 1L - # } .logMessages(date(), "...... Completed iteration:", @@ -953,22 +869,11 @@ addLogLikelihood <- function(llA, llB) { ) } - ## Add the log2 normalized counts into sce object - ## The normalized counts is also centered using library size in the - ## original count matrix in scater::normalizeSCE() - # sce <- suppressWarnings(scater::normalizeSCE(sce)) sce <- scater::logNormCounts(sce, log = TRUE) if (nrow(sce) <= varGenes) { topVariableGenes <- seq_len(nrow(sce)) } else if (nrow(sce) > varGenes) { - ## Use the top most variable genes to do rough clustering - ## (celda_CG & Louvian graph algorithm) - # mvTrend <- scran::trendVar(sce, use.spikes = FALSE) - # decomposeTrend <- scran::decomposeVar(sce, mvTrend) - # topVariableGenes <- order(decomposeTrend$bio, - # decreasing = TRUE)[seq(varGenes)] - sce.var <- scran::modelGeneVar(sce) topVariableGenes <- order(sce.var$bio, decreasing = TRUE @@ -995,18 +900,6 @@ addLogLikelihood <- function(llA, llB) { ) initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) - # if (L < nrow(countsFiltered)) { - # initialModuleSplit <- recursiveSplitModule(countsFiltered, - # initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) - # initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) - # fm <- factorizeMatrix(countsFiltered, initialModel, type = "counts") - # fm <- fm$counts$cell - # rm(initialModuleSplit) - # rm(initialModel) - # } else { - # fm <- countsFiltered - # } - .logMessages( date(), "...... Reducing dimensionality with UMAP", From bf83c0c2866d23a993cb1017ed1386884c001db6 Mon Sep 17 00:00:00 2001 From: syyang Date: Sun, 12 Jan 2020 22:12:04 -0500 Subject: [PATCH 147/149] .decontxInitializeZ passes seed for reproducibility --- R/decon.R | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/R/decon.R b/R/decon.R index 4cab952a..2b780345 100644 --- a/R/decon.R +++ b/R/decon.R @@ -359,7 +359,8 @@ setReplaceMethod( verbose = verbose, varGenes = varGenes, dbscanEps = dbscanEps, - L = L + L = L, + seed = seed ) } else { withr::with_seed( @@ -376,7 +377,8 @@ setReplaceMethod( verbose = verbose, varGenes = varGenes, dbscanEps = dbscanEps, - L = L + L = L, + seed = seed ) ) } @@ -481,6 +483,8 @@ setReplaceMethod( # This function updates decontamination for one batch +# seed passed to this function is to be furhter passed to +# function .decontxInitializeZ() .decontXoneBatch <- function(counts, z = NULL, batch = NULL, @@ -492,7 +496,8 @@ setReplaceMethod( verbose = TRUE, varGenes = NULL, dbscanEps = NULL, - L = NULL) { + L = NULL, + seed = 12345) { .checkCountsDecon(counts) .checkParametersDecon(proportionPrior = delta) @@ -523,6 +528,7 @@ setReplaceMethod( L = L, dbscanEps = dbscanEps, verbose = verbose, + seed = seed, logfile = logfile ) z <- celda.init$z @@ -861,6 +867,7 @@ addLogLikelihood <- function(llA, llB) { L = 50, dbscanEps = 1.0, verbose = TRUE, + seed = 12345, logfile = NULL) { if (!is(object, "SingleCellExperiment")) { sce <- SingleCellExperiment::SingleCellExperiment( @@ -870,6 +877,7 @@ addLogLikelihood <- function(llA, llB) { } sce <- scater::logNormCounts(sce, log = TRUE) + #sce <- scater::normalize(sce) if (nrow(sce) <= varGenes) { topVariableGenes <- seq_len(nrow(sce)) @@ -895,9 +903,13 @@ addLogLikelihood <- function(llA, llB) { ) ## Celda clustering using recursive module splitting L <- min(L, nrow(countsFiltered)) + if (is.null(seed)) { initialModuleSplit <- recursiveSplitModule(countsFiltered, - initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE - ) + initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + } else { + with_seed(seed, initialModuleSplit <- recursiveSplitModule(countsFiltered, + initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + )} initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) .logMessages( @@ -913,7 +925,7 @@ addLogLikelihood <- function(llA, llB) { # min_dist = 0.01, spread = 1) # rm(fm) resUmap <- celdaUmap(countsFiltered, initialModel, - minDist = 0.01, spread = 1, nNeighbors = nNeighbors + minDist = 0.01, spread = 1, nNeighbors = nNeighbors, seed = seed ) .logMessages( From ba13c0650587195f7ed30d420761c14de0d9a181 Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Mon, 13 Jan 2020 13:44:27 -0500 Subject: [PATCH 148/149] fix lints --- R/decon.R | 5 ++--- man/decontX.Rd | 14 +++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/R/decon.R b/R/decon.R index 2b780345..8ed0fe29 100644 --- a/R/decon.R +++ b/R/decon.R @@ -449,8 +449,7 @@ setReplaceMethod( returnResult$decontXcounts <- DelayedArray::DelayedArray(returnResult$decontXcounts) } else { - try( - { + try({ if (canCoerce(returnResult$decontXcounts, class(counts))) { returnResult$decontXcounts <- as(returnResult$decontXcounts, class(counts)) @@ -483,7 +482,7 @@ setReplaceMethod( # This function updates decontamination for one batch -# seed passed to this function is to be furhter passed to +# seed passed to this function is to be furhter passed to # function .decontxInitializeZ() .decontXoneBatch <- function(counts, z = NULL, diff --git a/man/decontX.Rd b/man/decontX.Rd index 29692266..3a5f3bad 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -50,7 +50,7 @@ of class \code{dgCMatrix} from package \link{Matrix} before analysis.} \item{z}{Numeric or character vector. Cell cluster labels. If NULL, Celda will be used to reduce the dimensionality of the dataset -to 'L' modules, '\link[uwot]{umap}' from the 'uwot' package +to 'L' modules, '\link[uwot]{umap}' from the 'uwot' package will be used to further reduce the dataset to 2 dimenions and the '\link[dbscan]{dbscan}' function from the 'dbscan' package will be used to identify clusters of broad cell types. Default NULL.} @@ -111,7 +111,7 @@ cluster labels will also be stored here.} } If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated -counts will be stored as an assay and can be accessed with +counts will be stored as an assay and can be accessed with \code{decontXcounts(x)}. The contamination values and cluster labels will be stored in \code{colData(x)}. \code{estimates} and \code{runParams} will be stored in \code{metadata(x)$decontX}. If z was not supplied, then @@ -123,9 +123,9 @@ Identifies contamination from factors such as ambient RNA in single cell genomic datasets. } \examples{ - s <- simulateContaminatedMatrix() - result <- decontX(s$observedCounts, s$z) - contamination <- colSums(s$observedCounts - s$nativeCounts) / - colSums(s$observedCounts) - plot(contamination, result$contamination) +s <- simulateContaminatedMatrix() +result <- decontX(s$observedCounts, s$z) +contamination <- colSums(s$observedCounts - s$nativeCounts) / + colSums(s$observedCounts) +plot(contamination, result$contamination) } From 5dcdc1f8d8001af13d79369edc430a5faa38014e Mon Sep 17 00:00:00 2001 From: 87875172 <314100793@qq.com> Date: Mon, 13 Jan 2020 14:08:11 -0500 Subject: [PATCH 149/149] fix lints --- R/decon.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/decon.R b/R/decon.R index 8ed0fe29..5e970439 100644 --- a/R/decon.R +++ b/R/decon.R @@ -876,7 +876,7 @@ addLogLikelihood <- function(llA, llB) { } sce <- scater::logNormCounts(sce, log = TRUE) - #sce <- scater::normalize(sce) + #sce <- scater::normalize(sce) if (nrow(sce) <= varGenes) { topVariableGenes <- seq_len(nrow(sce)) @@ -906,7 +906,7 @@ addLogLikelihood <- function(llA, llB) { initialModuleSplit <- recursiveSplitModule(countsFiltered, initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) } else { - with_seed(seed, initialModuleSplit <- recursiveSplitModule(countsFiltered, + with_seed(seed, initialModuleSplit <- recursiveSplitModule(countsFiltered, initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) )} initialModel <- subsetCeldaList(initialModuleSplit, list(L = L))