diff --git a/.travis.yml b/.travis.yml index 6d7e52e3..d0f4a664 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,4 +24,4 @@ r_github_packages: after_success: - R CMD INSTALL $PKG_TARBALL - Rscript -e 'covr::coveralls()' - - Rscript -e 'library(lintr); lint_package(linters = with_defaults(object_length_linter(length = 40L), object_name_linter = NULL, commented_code_linter = NULL, object_usage_linter = NULL), exclusions = list("R/RcppExports.R"))' \ No newline at end of file + - Rscript -e 'library(lintr); lint_package(linters = with_defaults(object_length_linter(length = 40L), object_name_linter = NULL, commented_code_linter = NULL, object_usage_linter = NULL, cyclocomp_linter(complexity_limit = 40)), exclusions = list("R/RcppExports.R"))' \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index b30655c6..4a7cc268 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,14 @@ Package: celda Title: CEllular Latent Dirichlet Allocation -Version: 1.1.2 -Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", role = c("aut", "cre")), - person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), - person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), - person("Zhe", "Wang", email="zhe@bu.edu", role = c("aut"))) -Description: celda leverages Bayesian hierarchical modeling to cluster genes, - cells, or both simultaneously from single cell sequencing data. +Version: 1.3.1 +Authors@R: c(person("Joshua", "Campbell", email = "camp@bu.edu", + role = c("aut", "cre")), + person("Sean", "Corbett", email = "scorbett@bu.edu", role = c("aut")), + person("Yusuke", "Koga", email="ykoga07@bu.edu", role = c("aut")), + person("Shiyi", "Yang", email="syyang@bu.edu", role = c("aut")), + person("Eric", "Reed", email="reeder@bu.edu", role = c("aut")), + person("Zhe", "Wang", email="zhe@bu.edu", role = c("aut"))) +Description: celda is a Bayesian hierarchical model that can co-cluster features and cells in single cell sequencing data. Depends: R (>= 3.6) VignetteBuilder: knitr @@ -31,8 +33,9 @@ Imports: S4Vectors, data.table, Rcpp, + RcppArmadillo, RcppEigen, - umap, + uwot, enrichR, stringi, SummarizedExperiment, @@ -42,7 +45,12 @@ Imports: withr, dendextend, ggdendro, - pROC + pROC, + magrittr, + scater (>= 1.14.4), + scran, + SingleCellExperiment, + dbscan Suggests: testthat, knitr, @@ -55,10 +63,10 @@ Suggests: M3DExampleData, BiocManager, BiocStyle -LinkingTo: Rcpp, RcppEigen +LinkingTo: Rcpp, RcppArmadillo, RcppEigen License: MIT + file LICENSE Encoding: UTF-8 LazyData: false -RoxygenNote: 6.1.1 +RoxygenNote: 7.0.2 BugReports: https://github.com/campbio/celda/issues biocViews: SingleCell, GeneExpression, Clustering, Sequencing, Bayesian diff --git a/NAMESPACE b/NAMESPACE index 4270e170..b0030709 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,9 @@ # Generated by roxygen2: do not edit by hand +export("decontXcounts<-") export(appendCeldaList) export(availableModels) export(bestLogLikelihood) -export(buildTreeHybrid) export(celda) export(celdaGridSearch) export(celdaHeatmap) @@ -19,11 +19,13 @@ export(clusters) export(compareCountMatrix) export(countChecksum) export(decontX) +export(decontXcounts) export(differentialExpression) export(distinctColors) export(factorizeMatrix) export(featureModuleLookup) export(featureModuleTable) +export(findMarkers) export(geneSetEnrich) export(getDecisions) export(logLikelihood) @@ -63,6 +65,7 @@ export(simulateContaminatedMatrix) export(subsetCeldaList) export(topRank) export(violinPlot) +exportMethods("decontXcounts<-") exportMethods(bestLogLikelihood) exportMethods(celdaHeatmap) exportMethods(celdaPerplexity) @@ -72,6 +75,8 @@ exportMethods(celdaUmap) exportMethods(clusterProbability) exportMethods(clusters) exportMethods(countChecksum) +exportMethods(decontX) +exportMethods(decontXcounts) exportMethods(factorizeMatrix) exportMethods(featureModuleLookup) exportMethods(logLikelihoodHistory) @@ -91,7 +96,9 @@ import(grDevices) import(graphics) import(grid) import(gridExtra, except = c(combine)) +import(magrittr) import(stats, except = c(start, end)) +import(uwot) importFrom(MAST,FromMatrix) importFrom(MAST,summary) importFrom(MAST,zlm) @@ -126,6 +133,7 @@ importFrom(gtable,gtable_height) importFrom(gtable,gtable_width) importFrom(matrixStats,logSumExp) importFrom(methods,.hasSlot) +importFrom(methods,hasArg) importFrom(methods,is) importFrom(methods,new) importFrom(pROC,auc) @@ -138,8 +146,6 @@ importFrom(scales,brewer_pal) importFrom(scales,dscale) importFrom(scales,hue_pal) importFrom(stringi,stri_list2matrix) -importFrom(umap,umap) -importFrom(umap,umap.defaults) importFrom(withr,with_seed) useDynLib(celda,"_colSumByGroup") useDynLib(celda,"_colSumByGroupChange") diff --git a/R/RcppExports.R b/R/RcppExports.R index 0a461013..eeaa6b79 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,10 +1,34 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +decontXEM <- function(counts, counts_colsums, theta, eta, phi, z, pseudocount) { + .Call('_celda_decontXEM', PACKAGE = 'celda', counts, counts_colsums, theta, eta, phi, z, pseudocount) +} + +decontXLogLik <- function(counts, theta, eta, phi, z, pseudocount) { + .Call('_celda_decontXLogLik', PACKAGE = 'celda', counts, theta, eta, phi, z, pseudocount) +} + +decontXInitialize <- function(counts, theta, z, pseudocount) { + .Call('_celda_decontXInitialize', PACKAGE = 'celda', counts, theta, z, pseudocount) +} + +calculateNativeMatrix <- function(counts, native_counts, theta, eta, phi, z, row_index, col_index, pseudocount) { + .Call('_celda_calculateNativeMatrix', PACKAGE = 'celda', counts, native_counts, theta, eta, phi, z, row_index, col_index, pseudocount) +} + cG_calcGibbsProbY_Simple <- function(counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) { .Call('_celda_cG_calcGibbsProbY_Simple', PACKAGE = 'celda', counts, nGbyTS, nTSbyC, nbyTS, nbyG, y, L, index, gamma, beta, delta) } +cG_CalcGibbsProbY_ori <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY_ori', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +} + +cG_CalcGibbsProbY_fastRow <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { + .Call('_celda_cG_CalcGibbsProbY_fastRow', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) +} + cG_CalcGibbsProbY <- function(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) { .Call('_celda_cG_CalcGibbsProbY', PACKAGE = 'celda', index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta) } diff --git a/R/all_generics.R b/R/all_generics.R index 3c66133c..f367468c 100755 --- a/R/all_generics.R +++ b/R/all_generics.R @@ -596,33 +596,26 @@ setGeneric("celdaTsne", #' requires more memory. Default 25000. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. -#' @param initialDims Integer. PCA will be used to reduce the dimentionality -#' of the dataset. The top 'initialDims' principal components will be used -#' for umap. Default 20. #' @param modules Integer vector. Determines which features modules to use for #' tSNE. If NULL, all modules will be used. Default NULL. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @param umapConfig An object of class "umapConfig" specifying parameters to #' the UMAP algorithm. -#' @return Numeric Matrix of dimension `ncol(counts)` x 2, colums representing -#' the "X" and "Y" coordinates in the data's t-SNE represetation. -#' @examples +#' @param ... Additional parameters to `uwot::umap` +#' @return A two column matrix of UMAP coordinates#' @examples #' data(celdaCGSim, celdaCGMod) -#' tsneRes <- celdaUmap(celdaCGSim$counts, celdaCGMod) -#' @importFrom umap umap.defaults +#' umapRes <- celdaUmap(celdaCGSim$counts, celdaCGMod) #' @export setGeneric("celdaUmap", signature = "celdaMod", function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, - initialDims = 20, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) { + ...) { standardGeneric("celdaUmap") }) diff --git a/R/buildTreeHelper.R b/R/buildTreeHelper.R index b6a8de70..f7f99efe 100644 --- a/R/buildTreeHelper.R +++ b/R/buildTreeHelper.R @@ -319,10 +319,10 @@ splitStats <- vapply( colnames(features), function(feat, features, class, splitMetric) { - splitMetric(feat, class, features, rPerf = T) + splitMetric(feat, class, features, rPerf = TRUE) }, features, class, splitMetric, FUN.VALUE = double(1)) names(splitStats) <- colnames(features) - splitStats <- sort(splitStats, decreasing = T) + splitStats <- sort(splitStats, decreasing = TRUE) return(splitStats) } @@ -412,7 +412,7 @@ featValues <- features[, feat] # Get order of values - ord <- order(featValues, decreasing = T) + ord <- order(featValues, decreasing = TRUE) # Get sorted class and values featValuesSort <- featValues[ord] @@ -516,7 +516,7 @@ featValues <- features[, feat] # Get order of values - ord <- order(featValues, decreasing = T) + ord <- order(featValues, decreasing = TRUE) # Get sorted class and values featValuesSort <- featValues[ord] @@ -835,7 +835,7 @@ stat = HM, stringsAsFactors = F)) }, .splitMetricModF1, fSub, cSub, group2only)) - altStats <- altStats[order(altStats$stat, decreasing = T), ] + altStats <- altStats[order(altStats$stat, decreasing = TRUE), ] # Get alternative splits splitStats <- altStats$stat[1] diff --git a/R/buildTreeHybrid.R b/R/buildTreeHybrid.R deleted file mode 100644 index 614be6f2..00000000 --- a/R/buildTreeHybrid.R +++ /dev/null @@ -1,128 +0,0 @@ -#' @title Generate decision tree from single-cell clustering output. -#' @description Uses decision tree procudure to generate a set of rules for each -#' cell cluster defined by a single-cell clustering. Splits are determined by -#' one of two metrics at each split: a one-off metric to determine rules for -#' identifying clusters by a single feature, and a balanced metric to determine -#' rules for identifying sets of similar clusters. -#' @param features A L(features) by N(samples) numeric matrix. -#' @param class A vector of K label assignemnts. -#' @param oneoffMetric A character string. What one-off metric to run, either -#' `modified F1` or `pairwise AUC`. -#' @param threshold A numeric value. The threshold for the oneoff metric to use -#' between 0 and 1, 0.95 by default. Smaller values will result is more one-off -#' splits. -#' @param reuseFeatures Logical. Whether or not a feature can be used more than -#' once on the same cluster. Default is TRUE. -#' @param altSplit Logical. Whether or not to force a marker for clusters that -#' are solely defined by the absence of markers. Defulault is TRUE -#' @param consecutiveOneoff Logical. Whether or not to allow one-off splits at -#' consecutive brances. Default it TRUE -#' @return A named list with five elements. -#' \itemize{ -#' \item rules - A named list with one `data.frame` for every label. Each -#' `data.frame` has five columns and gives the set of rules for disinguishing -#' each label. -#' \itemize{ -#' \item feature - Feature identifier. -#' \item direction - Relationship to feature value, -1 if less than, 1 if -#' greater than. -#' \item value - The feature value which defines the decision boundary -#' \item stat - The performance value returned by the splitting metric for -#' this split. -#' \item statUsed - Which performance metric was used. "IG" if information -#' gain and "OO" if one-off. -#' \item level - The level of the tree at which is rule was defined. 1 is the -#' level of the first split of the tree. -#' } -#' \item dendro - A dendrogram object of the decision tree output -#' \item summaryMatrix - A K(labels) by L(features) matrix representation of -#' the decision rules. Columns denote features and rows denote labels. Non-0 -#' values denote instances where a feature was used on a given label. Positive -#' and negative values denote whether the values of the label for that feature -#' were greater than or less than the decision threshold, respectively. The -#' magnitude of Non-0 values denote the level at which the feature was used, -#' where the first split has a magnitude of 1. Note, if reuse_features = TRUE, -#' only the final usage of a feature for a given label is shown. -#' \item prediction - A character vector of label of predictions of the -#' training data using the final model. "MISSING" if label prediction was -#' ambiguous. -#' \item performance - A named list denoting the training performance of the -#' model. -#' \itemize{ -#' \item accuracy - (number correct/number of samples) for the whole set of -#' samples. -#' \item balAcc - mean sensitivity across all labels -#' \item meanPrecision - mean precision across all labels -#' \item correct - the number of correct predictions of each label -#' \item sizes - the number of actual counts of each label -#' \item sensitivity - the sensitivity of the prediciton of each label. -#' \item precision - the precision of the prediciton of each label. -#' } -#' } -#' @examples -#' library(M3DExampleData) -#' counts <- M3DExampleData::Mmus_example_list$data -#' # Subset 500 genes for fast clustering -#' counts <- as.matrix(counts[1501:2000, ]) -#' # Cluster genes ans samples each into 10 modules -#' cm <- celda_CG(counts = counts, L = 10, K = 5, verbose = FALSE) -#' # Get features matrix and cluster assignments -#' factorized <- factorizeMatrix(counts, cm) -#' features <- factorized$proportions$cell -#' class <- clusters(cm)$z -#' # Generate Decision Tree -#' DecTree <- buildTreeHybrid(features, -#' class, -#' oneoffMetric = "modified F1", -#' threshold = 1, -#' consecutiveOneoff = FALSE) -#' -#' # Plot dendrogram -#' plotDendro(DecTree) -#' @export -buildTreeHybrid <- function(features, - class, - oneoffMetric = c("modified F1", "pairwise AUC"), - threshold = 0.95, - reuseFeatures = FALSE, - altSplit = TRUE, - consecutiveOneoff = TRUE) { - - if (ncol(features) != length(class)) { - stop("Number of columns of features must equal length of class") - } - if (any(is.na(class))) { - stop("NA class values") - } - if (any(is.na(features))) { - stop("NA feature values") - } - - # Match the oneoffMetric argument - oneoffMetric <- match.arg(oneoffMetric) - - # Transpose features - features <- t(features) - - # Set class to factor - class <- as.factor(class) - - # Generate list of tree levels - tree <- .generateTreeList( - features, - class, - oneoffMetric, - threshold, - reuseFeatures, - consecutiveOneoff) - - # Add alternative node for the solely down-regulated leaf - if (altSplit) { - tree <- .addAlternativeSplit(tree, features, class) - } - - # Format tree output for plotting and generate summary statistics - DTsummary <- .summarizeTree(tree, features, class) - - return(DTsummary) -} diff --git a/R/celda_C.R b/R/celda_C.R index e2270849..9cdf2c06 100755 --- a/R/celda_C.R +++ b/R/celda_C.R @@ -46,7 +46,7 @@ #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. #' @return An object of class `celda_C` with the cell population clusters -#' stored in in `z`. +#' stored in `z`. #' @seealso `celda_G()` for feature clustering and `celda_CG()` for simultaneous #' clustering of features and cells. `celdaGridSearch()` can be used to run #' multiple values of K and multiple chains in parallel. @@ -433,35 +433,57 @@ celda_C <- function(counts, ix <- sample(seq(nM)) for (i in ix) { ## Subtract cell counts from current population assignment - nGByCP1 <- nGByCP - nGByCP1[, z[i]] <- nGByCP[, z[i]] - counts[, i] - nGByCP1 <- .colSums(lgamma(nGByCP1 + beta), nrow(nGByCP), ncol(nGByCP)) + #nGByCP1 <- nGByCP + #nGByCP1[, z[i]] <- nGByCP[, z[i]] - counts[, i] + #nGByCP1 <- .colSums(lgamma(nGByCP1 + beta), nrow(nGByCP), ncol(nGByCP)) - nCP1 <- nCP - nCP1[z[i]] <- nCP1[z[i]] - nByC[i] - nCP1 <- lgamma(nCP1 + (nG * beta)) + #nCP1 <- nCP + #nCP1[z[i]] <- nCP1[z[i]] - nByC[i] + #nCP1 <- lgamma(nCP1 + (nG * beta)) ## Add cell counts to all other populations - nGByCP2 <- nGByCP - otherIx <- seq(K)[-z[i]] - nGByCP2[, otherIx] <- nGByCP2[, otherIx] + counts[, i] - nGByCP2 <- .colSums(lgamma(nGByCP2 + beta), nrow(nGByCP), ncol(nGByCP)) + #nGByCP2 <- nGByCP + #otherIx <- seq(K)[-z[i]] + #nGByCP2[, otherIx] <- nGByCP2[, otherIx] + counts[, i] + #nGByCP2 <- .colSums(lgamma(nGByCP2 + beta), nrow(nGByCP), ncol(nGByCP)) - nCP2 <- nCP - nCP2[otherIx] <- nCP2[otherIx] + nByC[i] - nCP2 <- lgamma(nCP2 + (nG * beta)) + #nCP2 <- nCP + #nCP2[otherIx] <- nCP2[otherIx] + nByC[i] + #nCP2 <- lgamma(nCP2 + (nG * beta)) mCPByS[z[i], s[i]] <- mCPByS[z[i], s[i]] - 1L ## Calculate probabilities for each state + ## when consider a specific cluster fo this cell, + ## no need to calculate cells in other cluster for (j in seq_len(K)) { - otherIx <- seq(K)[-j] - probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + ## Theta simplified - sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) - nGByCP2[j] - ## Phi Numerator (current cell) - sum(nCP1[otherIx]) - ## Phi Denominator (other cells) - nCP2[j] ## Phi Denominator (current cell) + #otherIx <- seq(K)[-j] + if (j != z[i]) { # when j is not current population assignment + ## Theta simplified + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + + # if adding this cell -- Phi Numerator + sum(lgamma(nGByCP[, j] + counts[, i] + beta)) - + # if adding this cell -- Phi Denominator + lgamma(nCP[j] + nByC[i] + nG * beta) - + # if without this cell -- Phi Numerator + sum(lgamma(nGByCP[, j] + beta)) + + # if without this cell -- Phi Denominator + lgamma(nCP[j] + nG * beta) + #sum(nGByCP1[otherIx]) + ## Phi Numerator (other cells) + #nGByCP2[j] - ## Phi Numerator (current cell) + #sum(nCP1[otherIx]) - ## Phi Denominator (other cells) + #nCP2[j] - ## Phi Denominator (current cell) + } else { # when j is current population assignment + ## Theta simplified + probs[j, i] <- log(mCPByS[j, s[i]] + alpha) + + sum(lgamma(nGByCP[, j] + beta)) - + lgamma(nCP[j] + nG * beta) - + sum(lgamma(nGByCP[, j] - counts[, i] + beta)) + + lgamma(nCP[j] - nByC[i] + nG * beta) + + } + } ## Sample next state and add back counts @@ -1035,14 +1057,13 @@ setMethod("celdaHeatmap", signature(celdaMod = "celda_C"), #' @param celdaMod Celda object of class `celda_C`. #' @param maxCells Integer. Maximum number of cells to plot. Cells will be #' randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -#' requires more memory. Default 25000. +#' requires more memory. If NULL, no subsampling will be performed. +#' Default NULL. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. #' @param initialDims Integer. PCA will be used to reduce the dimentionality #' of the dataset. The top 'initialDims' principal components will be used #' for tSNE. Default 20. -#' @param modules Integer vector. Determines which features modules to use for -#' tSNE. If NULL, all modules will be used. Default NULL. #' @param perplexity Numeric. Perplexity parameter for tSNE. Default 20. #' @param maxIter Integer. Maximum number of iterations in tSNE generation. #' Default 2500. @@ -1059,10 +1080,9 @@ setMethod("celdaHeatmap", signature(celdaMod = "celda_C"), setMethod("celdaTsne", signature(celdaMod = "celda_C"), function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, initialDims = 20, - modules = NULL, perplexity = 20, maxIter = 2500, seed = 12345) { @@ -1073,7 +1093,6 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), maxCells = maxCells, minClusterSize = minClusterSize, initialDims = initialDims, - modules = modules, perplexity = perplexity, maxIter = maxIter) } else { @@ -1083,7 +1102,6 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), maxCells = maxCells, minClusterSize = minClusterSize, initialDims = initialDims, - modules = modules, perplexity = perplexity, maxIter = maxIter)) } @@ -1094,18 +1112,16 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), .celdaTsneC <- function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, initialDims = 20, - modules = NULL, perplexity = 20, maxIter = 2500) { preparedCountInfo <- .prepareCountsForDimReductionCeldaC(counts, celdaMod, maxCells, - minClusterSize, - modules) + minClusterSize) res <- .calculateTsne(preparedCountInfo$norm, perplexity = perplexity, @@ -1116,7 +1132,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- res rownames(final) <- colnames(counts) - colnames(final) <- c("tsne_1", "tsne_2") + colnames(final) <- c("tSNE1", "tSNE_2") return(final) } @@ -1131,47 +1147,79 @@ setMethod("celdaTsne", signature(celdaMod = "celda_C"), #' @param celdaMod Celda object of class `celda_C`. #' @param maxCells Integer. Maximum number of cells to plot. Cells will be #' randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -#' requires more memory. Default 25000. +#' requires more memory. If NULL, no subsampling will be performed. +#' Default NULL. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. -#' @param modules Integer vector. Determines which features modules to use for -#' UMAP. If NULL, all modules will be used. Default NULL. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @param umapConfig An object of class "umap.config" specifying parameters to -#' the UMAP algorithm. +#' @param nNeighbors The size of local neighborhood used for +#' manifold approximation. Larger values result in more global +#' views of the manifold, while smaller values result in more +#' local data being preserved. Default 30. +#' See `?uwot::umap` for more information. +#' @param minDist The effective minimum distance between embedded points. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. +#' See `?uwot::umap` for more information. +#' @param spread The effective scale of embedded points. In combination with +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. See `?uwot::umap` for more information. +#' @param pca Logical. Whether to perform +#' dimensionality reduction with PCA before UMAP. +#' @param initialDims Integer. Number of dimensions from PCA to use as +#' input in UMAP. Default 50. +#' @param cores Number of threads to use. Default 1. +#' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_C()` for clustering cells and `celdaHeatmap()` for displaying #' expression. #' @examples #' data(celdaCSim, celdaCMod) #' umapRes <- celdaUmap(celdaCSim$counts, celdaCMod) -#' @return A two column matrix of umap coordinates +#' @return A two column matrix of UMAP coordinates #' @export setMethod("celdaUmap", signature(celdaMod = "celda_C"), function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, - modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) { + nNeighbors = 30, + minDist = 0.75, + spread = 1, + pca = TRUE, + initialDims = 50, + cores = 1, + ...) { if (is.null(seed)) { res <- .celdaUmapC(counts = counts, celdaMod = celdaMod, maxCells = maxCells, minClusterSize = minClusterSize, - modules = modules, - umapConfig = umapConfig) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + pca = pca, + initialDims = initialDims, + cores = cores, + ...) } else { with_seed(seed, res <- .celdaUmapC(counts = counts, celdaMod = celdaMod, maxCells = maxCells, minClusterSize = minClusterSize, - modules = modules, - umapConfig = umapConfig)) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + pca = pca, + initialDims = initialDims, + cores = cores, + ...)) } return(res) @@ -1180,36 +1228,48 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), .celdaUmapC <- function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, - modules = NULL, - umapConfig = umap::umap.defaults) { + nNeighbors = 30, + minDist = 0.2, + spread = 1, + pca = TRUE, + initialDims = 50, + cores = 1, + ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaC(counts, celdaMod, maxCells, - minClusterSize, - modules) - res <- .calculateUmap(preparedCountInfo$norm, umapConfig) + minClusterSize) + umapRes <- .calculateUmap(preparedCountInfo$norm, + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + pca = pca, + initialDims = initialDims, + cores = cores, + ...) + final <- matrix(NA, nrow = ncol(counts), ncol = 2) - final[preparedCountInfo$cellIx, ] <- res + final[preparedCountInfo$cellIx, ] <- umapRes rownames(final) <- colnames(counts) - colnames(final) <- c("umap_1", "umap_2") + colnames(final) <- c("UMAP_1", "UMAP_2") return(final) } .prepareCountsForDimReductionCeldaC <- function(counts, celdaMod, - maxCells = 25000, - minClusterSize = 100, - modules = NULL) { + maxCells = NULL, + minClusterSize = 100) { counts <- .processCounts(counts) compareCountMatrix(counts, celdaMod) ## Checking if maxCells and minClusterSize will work - if ((maxCells < ncol(counts)) & + if (!is.null(maxCells)) { + if ((maxCells < ncol(counts)) & (maxCells / minClusterSize < params(celdaMod)$K)) { stop("Cannot distribute ", @@ -1220,6 +1280,9 @@ setMethod("celdaUmap", signature(celdaMod = "celda_C"), minClusterSize, " cells per cluster. Try increasing 'maxCells' or decreasing", " 'minClusterSize'.") + } + } else { + maxCells <- ncol(counts) } ## Select a subset of cells to sample if greater than 'maxCells' diff --git a/R/celda_CG.R b/R/celda_CG.R index b17f153c..8f13132e 100755 --- a/R/celda_CG.R +++ b/R/celda_CG.R @@ -58,7 +58,7 @@ #' `logfile`. If NULL, messages will be printed to stdout. Default NULL. #' @param verbose Logical. Whether to print log messages. Default TRUE. #' @return An object of class `celda_CG` with the cell populations clusters -#' stored in in `z` and feature module clusters stored in `y`. +#' stored in `z` and feature module clusters stored in `y`. #' @seealso `celda_G()` for feature clustering and `celda_C()` for clustering #' cells. `celdaGridSearch()` can be used to run multiple values of K/L and #' multiple chains in parallel. @@ -1308,7 +1308,8 @@ setMethod("celdaHeatmap", signature(celdaMod = "celda_CG"), #' @param celdaMod Celda object of class `celda_CG`. #' @param maxCells Integer. Maximum number of cells to plot. Cells will be #' randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -#' requires more memory. Default 25000. +#' requires more memory. If NULL, no subsampling will be performed. +#' Default NULL. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. #' @param initialDims Integer. PCA will be used to reduce the dimentionality @@ -1332,7 +1333,7 @@ setMethod("celdaHeatmap", signature(celdaMod = "celda_CG"), setMethod("celdaTsne", signature(celdaMod = "celda_CG"), function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, initialDims = 20, modules = NULL, @@ -1367,7 +1368,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_CG"), .celdaTsneCG <- function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, initialDims = 20, modules = NULL, @@ -1388,7 +1389,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_CG"), final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- res rownames(final) <- colnames(counts) - colnames(final) <- c("tsne_1", "tsne_2") + colnames(final) <- c("tSNE_1", "tSNE_2") return(final) } @@ -1405,16 +1406,32 @@ setMethod("celdaTsne", signature(celdaMod = "celda_CG"), #' @param celdaMod Celda object of class `celda_CG`. #' @param maxCells Integer. Maximum number of cells to plot. Cells will be #' randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -#' requires more memory. Default 25000. +#' requires more memory. If NULL, no subsampling will be performed. +#' Default NULL. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. #' @param modules Integer vector. Determines which features modules to use for -#' tSNE. If NULL, all modules will be used. Default NULL. +#' UMAP. If NULL, all modules will be used. Default NULL. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @param umapConfig Object of class `umap.config`. Configures parameters for -#' umap. Default `umap::umap.defaults`. +#' @param nNeighbors The size of local neighborhood used for +#' manifold approximation. Larger values result in more global +#' views of the manifold, while smaller values result in more +#' local data being preserved. Default 30. +#' See `?uwot::umap` for more information. +#' @param minDist The effective minimum distance between embedded points. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. +#' See `?uwot::umap` for more information. +#' @param spread The effective scale of embedded points. In combination with +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. +#' See `?uwot::umap` for more information. +#' @param cores Number of threads to use. Default 1. +#' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_CG()` for clustering features and cells and `celdaHeatmap()` #' for displaying expression. #' @examples @@ -1425,11 +1442,15 @@ setMethod("celdaTsne", signature(celdaMod = "celda_CG"), setMethod("celdaUmap", signature(celdaMod = "celda_CG"), function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) { + nNeighbors = 30, + minDist = 0.75, + spread = 1, + cores = 1, + ...) { if (is.null(seed)) { res <- .celdaUmapCG(counts = counts, @@ -1437,7 +1458,11 @@ setMethod("celdaUmap", maxCells = maxCells, minClusterSize = minClusterSize, modules = modules, - umapConfig = umapConfig) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + cores = cores, + ...) } else { with_seed(seed, res <- .celdaUmapCG(counts = counts, @@ -1445,7 +1470,11 @@ setMethod("celdaUmap", maxCells = maxCells, minClusterSize = minClusterSize, modules = modules, - umapConfig = umapConfig)) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + cores = cores, + ...)) } return(res) @@ -1454,33 +1483,44 @@ setMethod("celdaUmap", .celdaUmapCG <- function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, modules = NULL, - umapConfig = umap::umap.defaults) { + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + cores = cores, + ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaCG(counts, celdaMod, maxCells, minClusterSize, modules) - umapRes <- .calculateUmap(preparedCountInfo$norm, umapConfig) + umapRes <- .calculateUmap(preparedCountInfo$norm, + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + cores = cores, + ...) + final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- umapRes rownames(final) <- colnames(counts) - colnames(final) <- c("umap_1", "umap_2") + colnames(final) <- c("UMAP_1", "UMAP_2") return(final) } .prepareCountsForDimReductionCeldaCG <- function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, modules = NULL) { ## Checking if maxCells and minClusterSize will work - if ((maxCells < ncol(counts)) & + if (!is.null(maxCells)) { + if ((maxCells < ncol(counts)) & (maxCells / minClusterSize < params(celdaMod)$K)) { stop("Cannot distribute ", @@ -1491,6 +1531,9 @@ setMethod("celdaUmap", minClusterSize, " cells per cluster. Try increasing 'maxCells' or", " decreasing 'minClusterSize'.") + } + } else { + maxCells <- ncol(counts) } fm <- factorizeMatrix(counts = counts, diff --git a/R/celda_G.R b/R/celda_G.R index ffd43b3e..5f2b566a 100755 --- a/R/celda_G.R +++ b/R/celda_G.R @@ -1000,7 +1000,8 @@ setMethod("celdaHeatmap", signature(celdaMod = "celda_G"), #' @param celdaMod Celda object of class `celda_G`. #' @param maxCells Integer. Maximum number of cells to plot. Cells will be #' randomly subsampled if ncol(conts) > maxCells. Larger numbers of cells -#' requires more memory. Default 10000. +#' requires more memory. If NULL, no subsampling will be performed. +#' Default NULL. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. #' @param initialDims Integer. PCA will be used to reduce the dimentionality of @@ -1024,7 +1025,7 @@ setMethod("celdaHeatmap", signature(celdaMod = "celda_G"), setMethod("celdaTsne", signature(celdaMod = "celda_G"), function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, initialDims = 20, modules = NULL, @@ -1060,7 +1061,7 @@ setMethod("celdaTsne", signature(celdaMod = "celda_G"), .celdaTsneG <- function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, initialDims = 20, modules = NULL, @@ -1076,9 +1077,11 @@ setMethod("celdaTsne", signature(celdaMod = "celda_G"), doPca = FALSE, perplexity = perplexity, maxIter = maxIter) - rownames(res) <- colnames(counts) - colnames(res) <- c("tsne_1", "tsne_2") - return(res) + final <- matrix(NA, nrow = ncol(counts), ncol = 2) + final[preparedCountInfo$cellIx, ] <- res + rownames(final) <- colnames(counts) + colnames(final) <- c("tSNE_1", "tSNE_2") + return(final) } @@ -1093,16 +1096,32 @@ setMethod("celdaTsne", signature(celdaMod = "celda_G"), #' @param celdaMod Celda object of class `celda_CG`. #' @param maxCells Integer. Maximum number of cells to plot. Cells will be #' randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -#' requires more memory. Default 25000. +#' requires more memory. If NULL, no subsampling will be performed. +#' Default NULL. #' @param minClusterSize Integer. Do not subsample cell clusters below this #' threshold. Default 100. #' @param modules Integer vector. Determines which features modules to use for -#' tSNE. If NULL, all modules will be used. Default NULL. +#' UMAP. If NULL, all modules will be used. Default NULL. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @param umapConfig Object of class `umap.config`. Configures parameters for -#' umap. Default `umap::umap.defaults`. +#' @param nNeighbors The size of local neighborhood used for +#' manifold approximation. Larger values result in more global +#' views of the manifold, while smaller values result in more +#' local data being preserved. Default 30. +#' See `?uwot::umap` for more information. +#' @param minDist The effective minimum distance between embedded points. +#' Smaller values will result in a more clustered/clumped +#' embedding where nearby points on the manifold are drawn +#' closer together, while larger values will result on a more +#' even dispersal of points. Default 0.2. +#' See `?uwot::umap` for more information. +#' @param spread The effective scale of embedded points. In combination with +#' ‘min_dist’, this determines how clustered/clumped the +#' embedded points are. Default 1. +#' See `?uwot::umap` for more information. +#' @param cores Number of threads to use. Default 1. +#' @param ... Other parameters to pass to `uwot::umap`. #' @seealso `celda_G()` for clustering features and cells and `celdaHeatmap()` #' for displaying expression #' @examples @@ -1113,11 +1132,15 @@ setMethod("celdaTsne", signature(celdaMod = "celda_G"), setMethod("celdaUmap", signature(celdaMod = "celda_G"), function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) { + nNeighbors = 30, + minDist = 0.2, + spread = 1, + cores = 1, + ...) { if (is.null(seed)) { res <- .celdaUmapG(counts = counts, @@ -1125,7 +1148,11 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), maxCells = maxCells, minClusterSize = minClusterSize, modules = modules, - umapConfig = umapConfig) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + cores = cores, + ...) } else { with_seed(seed, res <- .celdaUmapG(counts = counts, @@ -1133,7 +1160,11 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), maxCells = maxCells, minClusterSize = minClusterSize, modules = modules, - umapConfig = umapConfig)) + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + cores = cores, + ...)) } return(res) @@ -1142,33 +1173,46 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), .celdaUmapG <- function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, modules = NULL, - umapConfig = umap::umap.defaults) { + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + cores = cores, + ...) { preparedCountInfo <- .prepareCountsForDimReductionCeldaCG(counts, celdaMod, maxCells, minClusterSize, modules) - umapRes <- .calculateUmap(preparedCountInfo$norm, umapConfig) + umapRes <- .calculateUmap(preparedCountInfo$norm, + nNeighbors = nNeighbors, + minDist = minDist, + spread = spread, + cores = cores, + ...) + final <- matrix(NA, nrow = ncol(counts), ncol = 2) final[preparedCountInfo$cellIx, ] <- umapRes rownames(final) <- colnames(counts) - colnames(final) <- c("umap_1", "umap_2") + colnames(final) <- c("UMAP_1", "UMAP_2") return(final) } .prepareCountsForDimReductionCeldaCG <- function(counts, celdaMod, - maxCells = 25000, + maxCells = NULL, minClusterSize = 100, modules = NULL) { - if (maxCells > ncol(counts)) { - maxCells <- ncol(counts) + if (is.null(maxCells) || maxCells > ncol(counts)) { + maxCells <- ncol(counts) + cellIx <- seq_len(ncol(counts)) + } else { + cellIx <- sample(seq(ncol(counts)), maxCells) } fm <- factorizeMatrix(counts = counts, @@ -1185,7 +1229,6 @@ setMethod("celdaUmap", signature(celdaMod = "celda_G"), modulesToUse <- modules } - cellIx <- sample(seq(ncol(counts)), maxCells) norm <- t(normalizeCounts(fm$counts$cell[modulesToUse, cellIx], normalize = "proportion", transformationFun = sqrt)) diff --git a/R/celda_heatmap.R b/R/celda_heatmap.R index e3d1e7f4..80fb4046 100644 --- a/R/celda_heatmap.R +++ b/R/celda_heatmap.R @@ -79,6 +79,8 @@ plotHeatmap <- function(counts, z = NULL, y = NULL, + rowGroupOrder = NULL, + colGroupOrder = NULL, scaleRow = scale, trim = c(-2, 2), featureIx = NULL, @@ -276,6 +278,8 @@ plotHeatmap <- function(counts, treeHeightCol = treeheightCell, rowLabel = y, colLabel = z, + rowGroupOrder = rowGroupOrder, + colGroupOrder = colGroupOrder, silent = TRUE, ...) diff --git a/R/decon.R b/R/decon.R index f296e493..5e970439 100644 --- a/R/decon.R +++ b/R/decon.R @@ -1,134 +1,665 @@ - - -#' @title Simulate contaminated count matrix -#' @description This function generates a list containing two count matrices -- -#' one for real expression, the other one for contamination, as well as other -#' parameters used in the simulation which can be useful for running -#' decontamination. -#' @param C Integer. Number of cells to be simulated. Default to be 300. -#' @param G Integer. Number of genes to be simulated. Default to be 100. -#' @param K Integer. Number of cell populations to be simulated. Default to be -#' 3. -#' @param NRange Integer vector. A vector of length 2 that specifies the lower -#' and upper bounds of the number of counts generated for each cell. Default to -#' be c(500, 1000). -#' @param beta Numeric. Concentration parameter for Phi. Default to be 0.5. -#' @param delta Numeric or Numeric vector. Concentration parameter for Theta. If -#' input as a single numeric value, symmetric values for beta distribution are -#' specified; if input as a vector of lenght 2, the two values will be the -#' shape1 and shape2 paramters of the beta distribution respectively. +#' @title Contamination estimation with decontX +#' +#' @description Identifies contamination from factors such as ambient RNA +#' in single cell genomic datasets. +#' +#' @name decontX +#' +#' @param x A numeric matrix of counts or a \linkS4class{SingleCellExperiment} +#' with the matrix located in the assay slot under \code{assayName}. +#' Cells in each batch will be subsetted and converted to a sparse matrix +#' of class \code{dgCMatrix} from package \link{Matrix} before analysis. +#' @param assayName Character. Name of the assay to use if \code{x} is a +#' \linkS4class{SingleCellExperiment}. +#' @param z Numeric or character vector. Cell cluster labels. If NULL, +#' Celda will be used to reduce the dimensionality of the dataset +#' to 'L' modules, '\link[uwot]{umap}' from the 'uwot' package +#' will be used to further reduce the dataset to 2 dimenions and +#' the '\link[dbscan]{dbscan}' function from the 'dbscan' package +#' will be used to identify clusters of broad cell types. Default NULL. +#' @param batch Numeric or character vector. Batch labels for cells. +#' If batch labels are supplied, DecontX is run on cells from each +#' batch separately. Cells run in different channels or assays +#' should be considered different batches. Default NULL. +#' @param maxIter Integer. Maximum iterations of the EM algorithm. Default 500. +#' @param convergence Numeric. The EM algorithm will be stopped if the maximum +#' difference in the contamination estimates between the previous and +#' current iterations is less than this. Default 0.001. +#' @param iterLogLik Integer. Calculate log likelihood every 'iterLogLik' +#' iteration. Default 10. +#' @param delta Numeric. Symmetric Dirichlet concentration parameter +#' to initialize theta. Default 10. +#' @param varGenes Integer. The number of variable genes to use in +#' Celda clustering. Variability is calcualted using +#' \code{\link[scran]{modelGeneVar}} function from the 'scran' package. +#' Used only when z is not provided. Default 5000. +#' @param L Integer. Number of modules for Celda clustering. Used to reduce +#' the dimensionality of the dataset before applying UMAP and dbscan. +#' Used only when z is not provided. Default 50. +#' @param dbscanEps Numeric. The clustering resolution parameter +#' used in '\link[dbscan]{dbscan}' to estimate broad cell clusters. +#' Used only when z is not provided. Default 1. #' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, #' a default value of 12345 is used. If NULL, no calls to #' \link[withr]{with_seed} are made. -#' @return A list object containing the real expression matrix and contamination -#' expression matrix as well as other parameters used in the simulation. +#' @param logfile Character. Messages will be redirected to a file named +#' `logfile`. If NULL, messages will be printed to stdout. Default NULL. +#' @param verbose Logical. Whether to print log messages. Default TRUE. +#' +#' @return If \code{x} is a matrix-like object, a list will be returned +#' with the following items: +#' \describe{ +#' \item{\code{decontXcounts}:}{The decontaminated matrix. Values obtained +#' from the variational inference procedure may be non-integer. However, +#' integer counts can be obtained by rounding, +#' e.g. \code{round(decontXcounts)}.} +#' \item{\code{contamination}:}{Percentage of contamination in each cell.} +#' \item{\code{estimates}:}{List of estimated parameters for each batch. If z +#' was not supplied, then the UMAP coordinates used to generated cell +#' cluster labels will also be stored here.} +#' \item{\code{z}:}{Cell population/cluster labels used for analysis.} +#' \item{\code{runParams}:}{List of arguments used in the function call.} +#' } +#' +#' If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated +#' counts will be stored as an assay and can be accessed with +#' \code{decontXcounts(x)}. The contamination values and cluster labels +#' will be stored in \code{colData(x)}. \code{estimates} and \code{runParams} +#' will be stored in \code{metadata(x)$decontX}. If z was not supplied, then +#' the UMAPs used to generated cell cluster labels will be stored in +#' \code{reducedDims} slot in \code{x} +#' #' @examples -#' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = c(1, 9)) -#' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = 1) +#' s <- simulateContaminatedMatrix() +#' result <- decontX(s$observedCounts, s$z) +#' contamination <- colSums(s$observedCounts - s$nativeCounts) / +#' colSums(s$observedCounts) +#' plot(contamination, result$contamination) +NULL + #' @export -simulateContaminatedMatrix <- function(C = 300, - G = 100, - K = 3, - NRange = c(500, 1000), - beta = 0.5, - delta = c(1, 2), - seed = 12345) { +setGeneric("decontX", function(x, ...) standardGeneric("decontX")) - if (is.null(seed)) { - res <- .simulateContaminatedMatrix(C = C, - G = G, - K = K, - NRange = NRange, - beta = beta, - delta = delta) + +######################### +# Setting up S4 methods # +######################### + + +#' @export +#' @rdname decontX +setMethod("decontX", "SingleCellExperiment", function(x, + assayName = "counts", + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE) { + mat <- SummarizedExperiment::assay(x, i = assayName) + result <- .decontX( + counts = mat, + z = z, + batch = batch, + maxIter = maxIter, + convergence = convergence, + iterLogLik = iterLogLik, + delta = delta, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + seed = seed, + logfile = logfile, + verbose = verbose + ) + + ## Add results into column annotation + colData(x) <- cbind(colData(x), + decontX_Contamination = result$contamination, + decontX_Clusters = result$z + ) + + ## Put estimated UMAPs into SCE if z was estimated with Celda/UMAP + if (is.null(result$runParams$z)) { + batchIndex <- unique(result$runParams$batch) + if (length(batchIndex) > 1) { + for (i in batchIndex) { + + ## Each individual UMAP will only be for one batch so need + ## to put NAs in for cells in other batches + tempUMAP <- matrix(NA, ncol = 2, nrow = ncol(mat)) + tempUMAP[result$runParams$batch == i, ] <- result$estimates[[i]]$UMAP + colnames(tempUMAP) <- c("UMAP_1", "UMAP_2") + rownames(tempUMAP) <- colnames(mat) + + SingleCellExperiment::reducedDim( + x, + paste("decontX", i, "UMAP", sep = "_") + ) <- tempUMAP + } } else { - with_seed(seed, - res <- .simulateContaminatedMatrix(C = C, - G = G, - K = K, - NRange = NRange, - beta = beta, - delta = delta)) + SingleCellExperiment::reducedDim(x, "decontX_UMAP") <- + result$estimates[[batchIndex]]$UMAP } + } - return(res) + + ## Save the rest of the result object into metadata + decontXcounts(x) <- result$decontXcounts + result$decontXcounts <- NULL + metadata(x)$decontX <- result + + return(x) +}) + +#' @export +#' @rdname decontX +setMethod("decontX", "ANY", function(x, + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE) { + .decontX( + counts = x, + z = z, + batch = batch, + maxIter = maxIter, + convergence = convergence, + iterLogLik = iterLogLik, + delta = delta, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + seed = seed, + logfile = logfile, + verbose = verbose + ) +}) + + +## Copied from SingleCellExperiment Package + +GET_FUN <- function(exprs_values, ...) { + (exprs_values) # To ensure evaluation + function(object, ...) { + assay(object, i = exprs_values, ...) + } } +SET_FUN <- function(exprs_values, ...) { + (exprs_values) # To ensure evaluation + function(object, ..., value) { + assay(object, i = exprs_values, ...) <- value + object + } +} -.simulateContaminatedMatrix <- function(C = 300, - G = 100, - K = 3, - NRange = c(500, 1000), - beta = 0.5, - delta = c(1, 2)) { - if (length(delta) == 1) { - cpByC <- stats::rbeta(n = C, - shape1 = delta, - shape2 = delta) +#' @export +setGeneric("decontXcounts", function(object, ...) { + standardGeneric("decontXcounts") +}) + +#' @export +setGeneric("decontXcounts<-", function(object, ..., value) { + standardGeneric("decontXcounts<-") +}) + +#' @export +setMethod("decontXcounts", "SingleCellExperiment", GET_FUN("decontXcounts")) + +#' @export +setReplaceMethod( + "decontXcounts", c("SingleCellExperiment", "ANY"), + SET_FUN("decontXcounts") +) + + + + +########################## +# Core Decontx Functions # +########################## + +.decontX <- function(counts, + z = NULL, + batch = NULL, + maxIter = 200, + convergence = 0.001, + iterLogLik = 10, + delta = 10, + varGenes = NULL, + L = NULL, + dbscanEps = NULL, + seed = 12345, + logfile = NULL, + verbose = TRUE) { + startTime <- Sys.time() + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages("Starting DecontX", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + + runParams <- list( + z = z, + batch = batch, + maxIter = maxIter, + delta = delta, + convergence = convergence, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + logfile = logfile, + verbose = verbose + ) + + totalGenes <- nrow(counts) + totalCells <- ncol(counts) + geneNames <- rownames(counts) + nC <- ncol(counts) + allCellNames <- colnames(counts) + + ## Set up final deconaminated matrix + estRmat <- Matrix::Matrix( + data = 0, + ncol = totalCells, + nrow = totalGenes, + sparse = TRUE, + dimnames = list(geneNames, allCellNames) + ) + + ## Generate batch labels if none were supplied + if (is.null(batch)) { + batch <- rep("all_cells", nC) + } + runParams$batch <- batch + batchIndex <- unique(batch) + + ## Set result lists upfront for all cells from different batches + logLikelihood <- c() + estConp <- rep(NA, nC) + returnZ <- rep(NA, nC) + resBatch <- list() + + ## Cycle through each sample/batch and run DecontX + for (bat in batchIndex) { + if (length(batchIndex) == 1) { + .logMessages( + date(), + ".. Analyzing all cells", + logfile = logfile, + append = TRUE, + verbose = verbose + ) } else { - cpByC <- stats::rbeta(n = C, - shape1 = delta[1], - shape2 = delta[2]) + .logMessages( + date(), + " .. Analyzing cells in batch '", + bat, "'", + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose + ) } - z <- sample(seq(K), size = C, replace = TRUE) - if (length(unique(z)) < K) { - warning( - "Only ", - length(unique(z)), - " clusters are simulated. Try to increase numebr of cells 'C' if", - " more clusters are needed" - ) - K <- length(unique(z)) - z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) + zBat <- NULL + countsBat <- counts[, batch == bat] + + ## Convert to sparse matrix + if (!inherits(countsBat, "dgCMatrix")) { + .logMessages( + date(), + ".... Converting to sparse matrix", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + countsBat <- as(countsBat, "dgCMatrix") } - NbyC <- sample(seq(min(NRange), max(NRange)), - size = C, - replace = TRUE) - cNbyC <- vapply(seq(C), function(i) { - stats::rbinom(n = 1, - size = NbyC[i], - p = cpByC[i]) - }, integer(1)) - rNbyC <- NbyC - cNbyC - - phi <- .rdirichlet(K, rep(beta, G)) - - ## sample real expressed count matrix - cellRmat <- vapply(seq(C), function(i) { - stats::rmultinom(1, size = rNbyC[i], prob = phi[z[i], ]) - }, integer(G)) - - rownames(cellRmat) <- paste0("Gene_", seq(G)) - colnames(cellRmat) <- paste0("Cell_", seq(C)) - - ## sample contamination count matrix - nGByK <- - rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K) - eta <- normalizeCounts(counts = nGByK, normalize = "proportion") - - cellCmat <- vapply(seq(C), function(i) { - stats::rmultinom(1, size = cNbyC[i], prob = eta[, z[i]]) - }, integer(G)) - cellOmat <- cellRmat + cellCmat - - rownames(cellOmat) <- paste0("Gene_", seq(G)) - colnames(cellOmat) <- paste0("Cell_", seq(C)) - - return( - list( - "nativeCounts" = cellRmat, - "observedCounts" = cellOmat, - "NByC" = NbyC, - "z" = z, - "eta" = eta, - "phi" = t(phi) + + if (!is.null(z)) { + zBat <- z[batch == bat] + } + if (is.null(seed)) { + res <- .decontXoneBatch( + counts = countsBat, + z = zBat, + batch = bat, + maxIter = maxIter, + delta = delta, + convergence = convergence, + iterLogLik = iterLogLik, + logfile = logfile, + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps, + L = L, + seed = seed + ) + } else { + withr::with_seed( + seed, + res <- .decontXoneBatch( + counts = countsBat, + z = zBat, + batch = bat, + maxIter = maxIter, + delta = delta, + convergence = convergence, + iterLogLik = iterLogLik, + logfile = logfile, + verbose = verbose, + varGenes = varGenes, + dbscanEps = dbscanEps, + L = L, + seed = seed ) + ) + } + estRmat <- calculateNativeMatrix( + counts = countsBat, + native_counts = estRmat, + theta = res$theta, + eta = res$eta, + row_index = seq(nrow(counts)), + col_index = which(batch == bat), + phi = res$phi, + z = as.integer(res$z), + pseudocount = 1e-20 + ) + + resBatch[[bat]] <- list( + z = res$z, + phi = res$phi, + eta = res$eta, + delta = res$delta, + theta = res$theta, + logLikelihood = res$logLikelihood, + UMAP = res$UMAP, + z = res$z, + iteration = res$iteration + ) + + estConp[batch == bat] <- res$contamination + if (length(batchIndex) > 1) { + returnZ[batch == bat] <- paste0(bat, "-", res$z) + } else { + returnZ[batch == bat] <- res$z + } + + } + names(resBatch) <- batchIndex + + returnResult <- list( + "runParams" = runParams, + "estimates" = resBatch, + "decontXcounts" = estRmat, + "contamination" = estConp, + "z" = returnZ + ) + + ## Try to convert class of new matrix to class of original matrix + if (inherits(counts, "dgCMatrix")) { + .logMessages( + date(), + ".. Finalizing decontaminated matrix", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + } + + if (inherits(counts, c("DelayedMatrix", "DelayedArray"))) { + + ## Determine class of seed in DelayedArray + seed.class <- unique(DelayedArray::seedApply(counts, class))[[1]] + if (seed.class == "HDF5ArraySeed") { + returnResult$decontXcounts <- as(returnResult$decontXcounts, "HDF5Matrix") + } else { + if (isTRUE(canCoerce(returnResult$decontXcounts, seed.class))) { + returnResult$decontXcounts <- as(returnResult$decontXcounts, seed.class) + } + } + returnResult$decontXcounts <- + DelayedArray::DelayedArray(returnResult$decontXcounts) + } else { + try({ + if (canCoerce(returnResult$decontXcounts, class(counts))) { + returnResult$decontXcounts <- + as(returnResult$decontXcounts, class(counts)) + } + }, + silent = TRUE + ) + } + + endTime <- Sys.time() + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages("Completed DecontX. Total time:", + format(difftime(endTime, startTime)), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + .logMessages(paste(rep("-", 50), collapse = ""), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + + return(returnResult) +} + + +# This function updates decontamination for one batch +# seed passed to this function is to be furhter passed to +# function .decontxInitializeZ() +.decontXoneBatch <- function(counts, + z = NULL, + batch = NULL, + maxIter = 200, + delta = 10, + convergence = 0.01, + iterLogLik = 10, + logfile = NULL, + verbose = TRUE, + varGenes = NULL, + dbscanEps = NULL, + L = NULL, + seed = 12345) { + .checkCountsDecon(counts) + .checkParametersDecon(proportionPrior = delta) + + # nG <- nrow(counts) + nC <- ncol(counts) + deconMethod <- "clustering" + + ## Generate cell cluster labels if none are provided + umap <- NULL + if (is.null(z)) { + .logMessages( + date(), + ".... Estimating cell types with Celda", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + ## Always uses clusters for DecontX estimation + # deconMethod <- "background" + + varGenes <- .processvarGenes(varGenes) + dbscanEps <- .processdbscanEps(dbscanEps) + L <- .processL(L) + + celda.init <- .decontxInitializeZ( + object = counts, + varGenes = varGenes, + L = L, + dbscanEps = dbscanEps, + verbose = verbose, + seed = seed, + logfile = logfile + ) + z <- celda.init$z + umap <- celda.init$umap + colnames(umap) <- c( + "DecontX_UMAP_1", + "DecontX_UMAP_2" + ) + rownames(umap) <- colnames(counts) + } + + z <- .processCellLabels(z, numCells = nC) + K <- length(unique(z)) + + iter <- 1L + numIterWithoutImprovement <- 0L + stopIter <- 3L + + .logMessages( + date(), + ".... Estimating contamination", + logfile = logfile, + append = TRUE, + verbose = verbose + ) + + if (deconMethod == "clustering") { + ## Initialization + deltaInit <- delta + theta <- stats::rbeta( + n = nC, + shape1 = deltaInit, + shape2 = deltaInit + ) + + + nextDecon <- decontXInitialize( + counts = counts, + theta = theta, + z = z, + pseudocount = 1e-20 + ) + phi <- nextDecon$phi + eta <- nextDecon$eta + + ll <- c() + llRound <- decontXLogLik( + counts = counts, + z = z, + phi = phi, + eta = eta, + theta = theta, + pseudocount = 1e-20 ) + + ## EM updates + theta.previous <- theta + converged <- FALSE + counts.colsums <- Matrix::colSums(counts) + while (iter <= maxIter & !isTRUE(converged) & + numIterWithoutImprovement <= stopIter) { + + nextDecon <- decontXEM( + counts = counts, + counts_colsums = counts.colsums, + phi = phi, + eta = eta, + theta = theta, + z = z, + pseudocount = 1e-20 + ) + + theta <- nextDecon$theta + phi <- nextDecon$phi + eta <- nextDecon$eta + delta <- nextDecon$delta + + max.divergence <- max(abs(theta.previous - theta)) + if (max.divergence < convergence) { + converged <- TRUE + } + theta.previous <- theta + + ## Calculate likelihood and check for convergence + if (iter %% iterLogLik == 0 || converged) { + llTemp <- decontXLogLik( + counts = counts, + z = z, + phi = phi, + eta = eta, + theta = theta, + pseudocount = 1e-20 + ) + + ll <- c(ll, llTemp) + + .logMessages(date(), + "...... Completed iteration:", + iter, + "| converge:", + signif(max.divergence, 4), + logfile = logfile, + append = TRUE, + verbose = verbose + ) + } + + iter <- iter + 1L + } + } + + # resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) + resConp <- nextDecon$contamination + names(resConp) <- colnames(counts) + + return(list( + "logLikelihood" = ll, + "contamination" = resConp, + "theta" = theta, + "delta" = delta, + "phi" = phi, + "eta" = eta, + "UMAP" = umap, + "iteration" = iter - 1L, + "z" = z + )) } + + + # This function calculates the log-likelihood # # counts Numeric/Integer matrix. Observed count matrix, rows represent features @@ -140,23 +671,24 @@ simulateContaminatedMatrix <- function(C = 300, # populations # theta Numeric vector. Proportion of truely expressed transcripts .deconCalcLL <- function(counts, z, phi, eta, theta) { - # ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + - # 1e-20 ) ) # when dist_mat are K x G matrices - ll <- sum(t(counts) * log(theta * t(phi)[z, ] + - (1 - theta) * t(eta)[z, ] + 1e-20)) - return(ll) + # ll = sum( t(counts) * log( (1-conP )*geneDist[z,] + conP * conDist[z, ] + + # 1e-20 ) ) # when dist_mat are K x G matrices + ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + + (1 - theta) * t(eta)[z, ] + 1e-20)) + return(ll) } +# DEPRECATED. This is not used, but is kept as it might be useful in the future # This function calculates the log-likelihood of background distribution # decontamination # bgDist Numeric matrix. Rows represent feature and columns are the times that # the background-distribution has been replicated. .bgCalcLL <- function(counts, globalZ, cbZ, phi, eta, theta) { - # ll <- sum(t(counts) * log(theta * t(cellDist) + - # (1 - theta) * t(bgDist) + 1e-20)) - ll <- sum(t(counts) * log(theta * t(phi)[cbZ, ] + - (1 - theta) * t(eta)[globalZ, ] + 1e-20)) - return(ll) + # ll <- sum(t(counts) * log(theta * t(cellDist) + + # (1 - theta) * t(bgDist) + 1e-20)) + ll <- sum(t(counts) * log(theta * t(phi)[cbZ, ] + + (1 - theta) * t(eta)[globalZ, ] + 1e-20)) + return(ll) } @@ -168,595 +700,436 @@ simulateContaminatedMatrix <- function(C = 300, # theta Numeric vector. Proportion of truely expressed transctripts #' @importFrom MCMCprecision fit_dirichlet .cDCalcEMDecontamination <- function(counts, - phi, - eta, - theta, - z, - K, - delta) { - ## Notes: use fix-point iteration to update prior for theta, no need - ## to feed delta anymore - logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20) - logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20) + phi, + eta, + theta, + z, + K, + delta) { + ## Notes: use fix-point iteration to update prior for theta, no need + ## to feed delta anymore + + logPr <- log(t(phi)[z, ] + 1e-20) + log(theta + 1e-20) + logPc <- log(t(eta)[z, ] + 1e-20) + log(1 - theta + 1e-20) + Pr.e <- exp(logPr) + Pc.e <- exp(logPc) + Pr <- Pr.e / (Pr.e + Pc.e) + + estRmat <- t(Pr) * counts + rnGByK <- .colSumByGroupNumeric(estRmat, z, K) + cnGByK <- rowSums(rnGByK) - rnGByK + + counts.cs <- colSums(counts) + estRmat.cs <- colSums(estRmat) + estRmat.cs.n <- estRmat.cs / counts.cs + estCmat.cs.n <- 1 - estRmat.cs.n + temp <- cbind(estRmat.cs.n, estCmat.cs.n) + deltaV2 <- MCMCprecision::fit_dirichlet(temp)$alpha + + ## Update parameters + theta <- + (estRmat.cs + deltaV2[1]) / (counts.cs + sum(deltaV2)) + phi <- normalizeCounts(rnGByK, + normalize = "proportion", + pseudocountNormalize = 1e-20 + ) + eta <- normalizeCounts(cnGByK, + normalize = "proportion", + pseudocountNormalize = 1e-20 + ) + + return(list( + "estRmat" = estRmat, + "theta" = theta, + "phi" = phi, + "eta" = eta, + "delta" = deltaV2 + )) +} + +# DEPRECATED. This is not used, but is kept as it might be useful in the +# feature. +# This function updates decontamination using background distribution +.cDCalcEMbgDecontamination <- + function(counts, globalZ, cbZ, trZ, phi, eta, theta) { + logPr <- log(t(phi)[cbZ, ] + 1e-20) + log(theta + 1e-20) + logPc <- + log(t(eta)[globalZ, ] + 1e-20) + log(1 - theta + 1e-20) Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) Pc <- 1 - Pr deltaV2 <- - MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha + MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha estRmat <- t(Pr) * counts - rnGByK <- .colSumByGroupNumeric(estRmat, z, K) - cnGByK <- rowSums(rnGByK) - rnGByK - - ## Update parameters + phiUnnormalized <- + .colSumByGroupNumeric(estRmat, cbZ, max(cbZ)) + etaUnnormalized <- + rowSums(phiUnnormalized) - .colSumByGroupNumeric( + phiUnnormalized, + trZ, max(trZ) + ) + + ## Update paramters theta <- - (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) - phi <- normalizeCounts(rnGByK, + (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) + phi <- + normalizeCounts(phiUnnormalized, normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- normalizeCounts(cnGByK, + pseudocountNormalize = 1e-20 + ) + eta <- + normalizeCounts(etaUnnormalized, normalize = "proportion", - pseudocountNormalize = 1e-20) + pseudocountNormalize = 1e-20 + ) return(list( - "estRmat" = estRmat, - "theta" = theta, - "phi" = phi, - "eta" = eta, - "delta" = deltaV2 + "estRmat" = estRmat, + "theta" = theta, + "phi" = phi, + "eta" = eta, + "delta" = deltaV2 )) -} + } -# This function updates decontamination using background distribution -.cDCalcEMbgDecontamination <- - function(counts, globalZ, cbZ, trZ, phi, eta, theta) { - logPr <- log(t(phi)[cbZ, ] + 1e-20) + log(theta + 1e-20) - logPc <- - log(t(eta)[globalZ, ] + 1e-20) + log(1 - theta + 1e-20) - - Pr <- exp(logPr) / (exp(logPr) + exp(logPc)) - Pc <- 1 - Pr - deltaV2 <- - MCMCprecision::fit_dirichlet(matrix(c(Pr, Pc), ncol = 2))$alpha - - estRmat <- t(Pr) * counts - phiUnnormalized <- - .colSumByGroupNumeric(estRmat, cbZ, max(cbZ)) - etaUnnormalized <- - rowSums(phiUnnormalized) - .colSumByGroupNumeric(phiUnnormalized, - trZ, max(trZ)) - - ## Update paramters - theta <- - (colSums(estRmat) + deltaV2[1]) / (colSums(counts) + sum(deltaV2)) - phi <- - normalizeCounts(phiUnnormalized, - normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- - normalizeCounts(etaUnnormalized, - normalize = "proportion", - pseudocountNormalize = 1e-20) - - return(list( - "estRmat" = estRmat, - "theta" = theta, - "phi" = phi, - "eta" = eta, - "delta" = deltaV2 - )) - } -#' @title Decontaminate count matrix -#' @description This function updates decontamination on dataset with multiple -#' batches. -#' @param counts Numeric/Integer matrix. Observed count matrix, rows represent -#' features and columns represent cells. -#' @param z Integer vector. Cell population labels. Default NULL. -#' @param batch Integer vector. Cell batch labels. Default NULL. -#' @param maxIter Integer. Maximum iterations of EM algorithm. Default to be -#' 200. -#' @param delta Numeric. Symmetric concentration parameter for Theta. Default -#' to be 10. -#' @param logfile Character. Messages will be redirected to a file named -#' `logfile`. If NULL, messages will be printed to stdout. Default NULL. -#' @param verbose Logical. Whether to print log messages. Default TRUE. -#' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, -#' a default value of 12345 is used. If NULL, no calls to -#' \link[withr]{with_seed} are made. -#' @return A list object which contains the decontaminated count matrix and -#' related parameters. -#' @examples -#' data(contaminationSim) -#' deconC <- decontX( -#' counts = contaminationSim$rmat + contaminationSim$cmat, -#' z = contaminationSim$z, maxIter = 3 -#' ) -#' deconBg <- decontX( -#' counts = contaminationSim$rmat + contaminationSim$cmat, -#' maxIter = 3 -#' ) -#' @export -decontX <- function(counts, - z = NULL, - batch = NULL, - maxIter = 200, - delta = 10, - logfile = NULL, - verbose = TRUE, - seed = 12345) { - - if (is.null(seed)) { - res <- .decontX(counts = counts, - z = z, - batch = batch, - maxIter = maxIter, - delta = delta, - logfile = logfile, - verbose = verbose) - } else { - with_seed(seed, - res <- .decontX(counts = counts, - z = z, - batch = batch, - maxIter = maxIter, - delta = delta, - logfile = logfile, - verbose = verbose)) - } - return(res) +## Make sure provided parameters are the right type and value range +.checkParametersDecon <- function(proportionPrior) { + if (length(proportionPrior) > 1 | any(proportionPrior <= 0)) { + stop("'delta' should be a single positive value.") + } } -.decontX <- function(counts, - z = NULL, - batch = NULL, - maxIter = 200, - delta = 10, - logfile = NULL, - verbose = TRUE) { - if (!is.null(batch)) { - ## Set result lists upfront for all cells from different batches - logLikelihood <- c() - estRmat <- matrix( - NA, - ncol = ncol(counts), - nrow = nrow(counts), - dimnames = list(rownames(counts), colnames(counts)) - ) - theta <- rep(NA, ncol(counts)) - estConp <- rep(NA, ncol(counts)) - - batchIndex <- unique(batch) - - for (bat in batchIndex) { - countsBat <- counts[, batch == bat] - if (!is.null(z)) { - zBat <- z[batch == bat] - } else { - zBat <- z - } - resBat <- .decontXoneBatch( - counts = countsBat, - z = zBat, - batch = bat, - maxIter = maxIter, - delta = delta, - logfile = logfile, - verbose = verbose - ) - - estRmat[, batch == bat] <- - resBat$resList$estNativeCounts - estConp[batch == bat] <- resBat$resList$estConp - theta[batch == bat] <- resBat$resList$theta - - if (is.null(logLikelihood)) { - logLikelihood <- resBat$resList$logLikelihood - } else { - logLikelihood <- addLogLikelihood(logLikelihood, - resBat$resList$logLikelihood) - } - } - - runParams <- resBat$runParams - method <- resBat$method - resList <- list( - "logLikelihood" = logLikelihood, - "estNativeCounts" = estRmat, - "estConp" = estConp, - "theta" = theta - ) +## Make sure provided count matrix is the right type +.checkCountsDecon <- function(counts) { + if (sum(is.na(counts)) > 0) { + stop("Missing value in 'counts' matrix.") + } + if (is.null(dim(counts))) { + stop("At least 2 genes need to have non-zero expressions.") + } +} - return(list( - "runParams" = runParams, - "resList" = resList, - "method" = method - )) - } - return( - .decontXoneBatch( - counts = counts, - z = z, - maxIter = maxIter, - delta = delta, - logfile = logfile, - verbose = verbose - ) +## Make sure provided cell labels are the right type +#' @importFrom plyr mapvalues +.processCellLabels <- function(z, numCells) { + if (length(z) != numCells) { + stop( + "'z' must be of the same length as the number of cells in the", + " 'counts' matrix." ) + } + if (length(unique(z)) < 2) { + stop( + "No need to decontaminate when only one cluster", + " is in the dataset." + ) # Even though + # everything runs smoothly when length(unique(z)) == 1, result is not + # trustful + } + if (!is.factor(z)) { + z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) + z <- as.factor(z) + } + return(z) } -# This function updates decontamination for one batch -.decontXoneBatch <- function(counts, - z = NULL, - batch = NULL, - maxIter = 200, - delta = 10, - logfile = NULL, - verbose = TRUE) { - .checkCountsDecon(counts) - .checkParametersDecon(proportionPrior = delta) - - # nG <- nrow(counts) - nC <- ncol(counts) - K <- length(unique(z)) - - if (is.null(z)) { - deconMethod <- "background" - } else { - deconMethod <- "clustering" - z <- .processCellLabels(z, numCells = nC) - } - - iter <- 1L - numIterWithoutImprovement <- 0L - stopIter <- 3L - - .logMessages( - paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose - ) - .logMessages( - "Start DecontX. Decontamination", - logfile = logfile, - append = TRUE, - verbose = verbose - ) +## Add two (veried-length) vectors of logLikelihood +addLogLikelihood <- function(llA, llB) { + lengthA <- length(llA) + lengthB <- length(llB) + + if (lengthA >= lengthB) { + llB <- c(llB, rep(llB[lengthB], lengthA - lengthB)) + ll <- llA + llB + } else { + llA <- c(llA, rep(llA[lengthA], lengthB - lengthA)) + ll <- llA + llB + } + + return(ll) +} - if (!is.null(batch)) { - .logMessages( - "batch: ", - batch, - logfile = logfile, - append = TRUE, - verbose = verbose - ) - } - .logMessages( - paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose - ) - startTime <- Sys.time() - - if (deconMethod == "clustering") { - ## Initialization - deltaInit <- delta - # theta = stats::runif(nC, min = 0.1, max = 0.5) - theta <- stats::rbeta(n = nC, - shape1 = deltaInit, - shape2 = deltaInit) - estRmat <- t(t(counts) * theta) - phi <- .colSumByGroupNumeric(estRmat, z, K) - eta <- rowSums(phi) - phi - phi <- normalizeCounts(phi, - normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- normalizeCounts(eta, - normalize = "proportion", - pseudocountNormalize = 1e-20) - ll <- c() - - llRound <- .deconCalcLL( - counts = counts, - z = z, - phi = phi, - eta = eta, - theta = theta - ) - ## EM updates - while (iter <= maxIter & - numIterWithoutImprovement <= stopIter) { - nextDecon <- .cDCalcEMDecontamination( - counts = counts, - phi = phi, - eta = eta, - theta = theta, - z = z, - K = K, - delta = delta - ) - - theta <- nextDecon$theta - phi <- nextDecon$phi - eta <- nextDecon$eta - delta <- nextDecon$delta - - ## Calculate log-likelihood - llTemp <- .deconCalcLL( - counts = counts, - z = z, - phi = phi, - eta = eta, - theta = theta - ) - ll <- c(ll, llTemp) - llRound <- c(llRound, round(llTemp, 2)) - - if (round(llTemp, 2) > llRound[iter] | iter == 1) { - numIterWithoutImprovement <- 1L - } else { - numIterWithoutImprovement <- numIterWithoutImprovement + 1L - } - iter <- iter + 1L - } +## Initialization of cell labels for DecontX when they are not given +.decontxInitializeZ <- + function(object, # object is either a sce object or a count matrix + varGenes = 5000, + L = 50, + dbscanEps = 1.0, + verbose = TRUE, + seed = 12345, + logfile = NULL) { + if (!is(object, "SingleCellExperiment")) { + sce <- SingleCellExperiment::SingleCellExperiment( + assays = + list(counts = object) + ) } - if (deconMethod == "background") { - ## Initialize cell label - initialLabel <- .decontxInitializeZ(counts = counts) - globalZ <- initialLabel$globalZ - cbZ <- initialLabel$cbZ - trZ <- initialLabel$trZ - - ## Initialization - deltaInit <- delta - theta <- - stats::rbeta(n = nC, - shape1 = deltaInit, - shape2 = deltaInit) - estRmat <- t(t(counts) * theta) - - phi <- .colSumByGroupNumeric(estRmat, cbZ, max(cbZ)) - eta <- - rowSums(phi) - .colSumByGroupNumeric(phi, trZ, max(trZ)) - phi <- - normalizeCounts(phi, - normalize = "proportion", - pseudocountNormalize = 1e-20) - eta <- - normalizeCounts(eta, - normalize = "proportion", - pseudocountNormalize = 1e-20) - - ll <- c() - - llRound <- .bgCalcLL( - counts = counts, - globalZ = globalZ, - cbZ = cbZ, - phi = phi, - eta = eta, - theta = theta - ) + sce <- scater::logNormCounts(sce, log = TRUE) + #sce <- scater::normalize(sce) - ## EM updates - while (iter <= maxIter & - numIterWithoutImprovement <= stopIter) { - nextDecon <- .cDCalcEMbgDecontamination( - counts = counts, - globalZ = globalZ, - cbZ = cbZ, - trZ = trZ, - phi = phi, - eta = eta, - theta = theta - ) - - theta <- nextDecon$theta - phi <- nextDecon$phi - eta <- nextDecon$eta - delta <- nextDecon$delta - - ## Calculate log-likelihood - llTemp <- - .bgCalcLL( - counts = counts, - globalZ = globalZ, - cbZ = cbZ, - phi = phi, - eta = eta, - theta = theta - ) - ll <- c(ll, llTemp) - llRound <- c(llRound, round(llTemp, 2)) - - if (round(llTemp, 2) > llRound[iter] | iter == 1) { - numIterWithoutImprovement <- 1L - } else { - numIterWithoutImprovement <- numIterWithoutImprovement + 1L - } - iter <- iter + 1L - } + if (nrow(sce) <= varGenes) { + topVariableGenes <- seq_len(nrow(sce)) + } else if (nrow(sce) > varGenes) { + sce.var <- scran::modelGeneVar(sce) + topVariableGenes <- order(sce.var$bio, + decreasing = TRUE + )[seq(varGenes)] } + countsFiltered <- as.matrix(SingleCellExperiment::counts( + sce[topVariableGenes, ] + )) + storage.mode(countsFiltered) <- "integer" - resConp <- 1 - colSums(nextDecon$estRmat) / colSums(counts) - - endTime <- Sys.time() .logMessages( - paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose + date(), + "...... Collapsing features into", + L, + "modules", + logfile = logfile, + append = TRUE, + verbose = verbose ) + ## Celda clustering using recursive module splitting + L <- min(L, nrow(countsFiltered)) + if (is.null(seed)) { + initialModuleSplit <- recursiveSplitModule(countsFiltered, + initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + } else { + with_seed(seed, initialModuleSplit <- recursiveSplitModule(countsFiltered, + initialL = L, maxL = L, perplexity = FALSE, verbose = FALSE) + )} + initialModel <- subsetCeldaList(initialModuleSplit, list(L = L)) + .logMessages( - "Completed DecontX. Total time:", - format(difftime(endTime, startTime)), - logfile = logfile, - append = TRUE, - verbose = verbose + date(), + "...... Reducing dimensionality with UMAP", + logfile = logfile, + append = TRUE, + verbose = verbose ) - if (!is.null(batch)) { - .logMessages( - "batch: ", - batch, - logfile = logfile, - append = TRUE, - verbose = verbose - ) - } - .logMessages( - paste(rep("-", 50), collapse = ""), - logfile = logfile, - append = TRUE, - verbose = verbose + ## Louvan graph-based method to reduce dimension into 2 cluster + nNeighbors <- min(15, ncol(countsFiltered)) + # resUmap <- uwot::umap(t(sqrt(fm)), n_neighbors = nNeighbors, + # min_dist = 0.01, spread = 1) + # rm(fm) + resUmap <- celdaUmap(countsFiltered, initialModel, + minDist = 0.01, spread = 1, nNeighbors = nNeighbors, seed = seed ) - runParams <- list("deltaInit" = deltaInit, - "iteration" = iter - 1L) - - resList <- list( - "logLikelihood" = ll, - "estNativeCounts" = nextDecon$estRmat, - "estConp" = resConp, - "theta" = theta, - "delta" = delta + .logMessages( + date(), + " ...... Determining cell clusters with DBSCAN (Eps=", + dbscanEps, + ")", + sep = "", + logfile = logfile, + append = TRUE, + verbose = verbose ) - # if( deconMethod=="clustering" ) { - # posterior.params = list( "est.GeneDist"=phi, "est.ConDist"=eta ) - # resList = append( resList , posterior.params ) - # } + # Use dbSCAN on the UMAP to identify broad cell types + totalClusters <- 1 + while (totalClusters <= 1 & dbscanEps > 0) { + resDbscan <- dbscan::dbscan(resUmap, dbscanEps) + dbscanEps <- dbscanEps - (0.25 * dbscanEps) + totalClusters <- length(unique(resDbscan$cluster)) + } return(list( - "runParams" = runParams, - "resList" = resList, - "method" = deconMethod + "z" = resDbscan$cluster, + "umap" = resUmap )) -} + } -## Make sure provided parameters are the right type and value range -.checkParametersDecon <- function(proportionPrior) { - if (length(proportionPrior) > 1 | any(proportionPrior <= 0)) { - stop("'delta' should be a single positive value.") +## process varGenes +.processvarGenes <- function(varGenes) { + if (is.null(varGenes)) { + varGenes <- 5000 + } else { + if (varGenes < 2 | length(varGenes) > 1) { + stop("Parameter 'varGenes' must be an integer larger than 1.") } + } + return(varGenes) } - -## Make sure provided count matrix is the right type -.checkCountsDecon <- function(counts) { - if (sum(is.na(counts)) > 0) { - stop("Missing value in 'counts' matrix.") +## process dbscanEps for resolusion threshold using DBSCAN +.processdbscanEps <- function(dbscanEps) { + if (is.null(dbscanEps)) { + dbscanEps <- 1 + } else { + if (dbscanEps < 0) { + stop("Parameter 'dbscanEps' needs to be non-negative.") } + } + return(dbscanEps) } - -## Make sure provided cell labels are the right type -#' @importFrom plyr mapvalues -.processCellLabels <- function(z, numCells) { - if (length(z) != numCells) { - stop("'z' must be of the same length as the number of cells in the", - " 'counts' matrix.") - } - if (length(unique(z)) < 2) { - stop("'z' must have at least 2 different values.") # Even though - # everything runs smoothly when length(unique(z)) == 1, result is not - # trustful - } - if (!is.factor(z)) { - z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) - z <- as.factor(z) +## process gene modules L +.processL <- function(L) { + if (is.null(L)) { + L <- 50 + } else { + if (L < 2 | length(L) > 1) { + stop("Parameter 'L' must be an integer larger than 1.") } - return(z) + } + return(L) } -## Add two (veried-length) vectors of logLikelihood -addLogLikelihood <- function(llA, llB) { - lengthA <- length(llA) - lengthB <- length(llB) - if (lengthA >= lengthB) { - llB <- c(llB, rep(llB[lengthB], lengthA - lengthB)) - ll <- llA + llB - } else { - llA <- c(llA, rep(llA[lengthA], lengthB - lengthA)) - ll <- llA + llB - } - - return(ll) -} - - - -## Initialization of cell labels for DecontX when they are not given -.decontxInitializeZ <- - function(counts, - K = 10, - minCell = 3, - seed = 428) { - nC <- ncol(counts) - if (nC < 100) { - K <- ceiling(sqrt(nC)) - } - - globalZ <- - .initializeSplitZ( - counts, - K = K, - KSubcluster = NULL, - alpha = 1, - beta = 1, - minCell = 3 - ) - globalK <- max(globalZ) - - localZ <- rep(NA, nC) - for (k in 1:globalK) { - if (sum(globalZ == k) > 2) { - localCounts <- counts[, globalZ == k] - localK <- min(K, ceiling(sqrt(ncol( - localCounts - )))) - localZ[globalZ == k] <- .initializeSplitZ( - localCounts, - K = localK, - KSubcluster = NULL, - alpha = 1, - beta = 1, - minCell = 3 - ) - } else { - localZ [globalZ == k] <- 1L - } - } +######################### +# Simulating Data # +######################### +#' @title Simulate contaminated count matrix +#' @description This function generates a list containing two count matrices -- +#' one for real expression, the other one for contamination, as well as other +#' parameters used in the simulation which can be useful for running +#' decontamination. +#' @param C Integer. Number of cells to be simulated. Default to be 300. +#' @param G Integer. Number of genes to be simulated. Default to be 100. +#' @param K Integer. Number of cell populations to be simulated. Default to be +#' 3. +#' @param NRange Integer vector. A vector of length 2 that specifies the lower +#' and upper bounds of the number of counts generated for each cell. Default to +#' be c(500, 1000). +#' @param beta Numeric. Concentration parameter for Phi. Default to be 0.5. +#' @param delta Numeric or Numeric vector. Concentration parameter for Theta. If +#' input as a single numeric value, symmetric values for beta distribution are +#' specified; if input as a vector of lenght 2, the two values will be the +#' shape1 and shape2 paramters of the beta distribution respectively. +#' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility, +#' a default value of 12345 is used. If NULL, no calls to +#' \link[withr]{with_seed} are made. +#' @return A list object containing the real expression matrix and contamination +#' expression matrix as well as other parameters used in the simulation. +#' @examples +#' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = c(1, 9)) +#' contaminationSim <- simulateContaminatedMatrix(K = 3, delta = 1) +#' @export +simulateContaminatedMatrix <- function(C = 300, + G = 100, + K = 3, + NRange = c(500, 1000), + beta = 0.5, + delta = c(1, 2), + seed = 12345) { + if (is.null(seed)) { + res <- .simulateContaminatedMatrix( + C = C, + G = G, + K = K, + NRange = NRange, + beta = beta, + delta = delta + ) + } else { + with_seed( + seed, + res <- .simulateContaminatedMatrix( + C = C, + G = G, + K = K, + NRange = NRange, + beta = beta, + delta = delta + ) + ) + } - cbZ <- - interaction(globalZ, localZ, lex.order = TRUE, drop = TRUE) - # combined z label - trZ <- - as.integer(sub("\\..*", "", levels(cbZ), perl = TRUE)) - # transitional z label - cbZ <- - as.integer(plyr::mapvalues(cbZ, from = levels(cbZ), - to = 1:length(levels(cbZ)))) + return(res) +} - return(list( - "globalZ" = globalZ, - "localZ" = localZ, - "trZ" = trZ, - "cbZ" = cbZ - )) - } +.simulateContaminatedMatrix <- function(C = 300, + G = 100, + K = 3, + NRange = c(500, 1000), + beta = 0.5, + delta = c(1, 2)) { + if (length(delta) == 1) { + cpByC <- stats::rbeta( + n = C, + shape1 = delta, + shape2 = delta + ) + } else { + cpByC <- stats::rbeta( + n = C, + shape1 = delta[1], + shape2 = delta[2] + ) + } + + z <- sample(seq(K), size = C, replace = TRUE) + if (length(unique(z)) < K) { + warning( + "Only ", + length(unique(z)), + " clusters are simulated. Try to increase numebr of cells 'C' if", + " more clusters are needed" + ) + K <- length(unique(z)) + z <- plyr::mapvalues(z, unique(z), seq(length(unique(z)))) + } + + NbyC <- sample(seq(min(NRange), max(NRange)), + size = C, + replace = TRUE + ) + cNbyC <- vapply(seq(C), function(i) { + stats::rbinom( + n = 1, + size = NbyC[i], + p = cpByC[i] + ) + }, integer(1)) + rNbyC <- NbyC - cNbyC + + phi <- .rdirichlet(K, rep(beta, G)) + + ## sample real expressed count matrix + cellRmat <- vapply(seq(C), function(i) { + stats::rmultinom(1, size = rNbyC[i], prob = phi[z[i], ]) + }, integer(G)) + + rownames(cellRmat) <- paste0("Gene_", seq(G)) + colnames(cellRmat) <- paste0("Cell_", seq(C)) + + ## sample contamination count matrix + nGByK <- + rowSums(cellRmat) - .colSumByGroup(cellRmat, group = z, K = K) + eta <- normalizeCounts(counts = nGByK, normalize = "proportion") + + cellCmat <- vapply(seq(C), function(i) { + stats::rmultinom(1, size = cNbyC[i], prob = eta[, z[i]]) + }, integer(G)) + cellOmat <- cellRmat + cellCmat + + rownames(cellOmat) <- paste0("Gene_", seq(G)) + colnames(cellOmat) <- paste0("Cell_", seq(C)) + + return( + list( + "nativeCounts" = cellRmat, + "observedCounts" = cellOmat, + "NByC" = NbyC, + "z" = z, + "eta" = eta, + "phi" = t(phi) + ) + ) +} diff --git a/R/feature_selection.R b/R/feature_selection.R index 1f9cac4b..f4e5a140 100755 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -9,7 +9,7 @@ #' columns. Default 2. #' @param threshold Numeric. Only return ranked rows or columns in the matrix #' that are above this threshold. If NULL, then no threshold will be applied. -#' Default 1. +#' Default 0. #' @param decreasing Logical. Specifies if the rank should be decreasing. #' Default TRUE. #' @return List. The `index` variable provides the top `n` row (feature) indices diff --git a/R/findMarkers.R b/R/findMarkers.R new file mode 100644 index 00000000..8d5a286c --- /dev/null +++ b/R/findMarkers.R @@ -0,0 +1,346 @@ +#' @title Generate decision tree from single-cell clustering output. +#' @description Uses decision tree procudure to generate a set of rules for each +#' cell cluster defined by a single-cell clustering. Splits are determined by +#' one of two metrics at each split: a one-off metric to determine rules for +#' identifying clusters by a single feature, and a balanced metric to determine +#' rules for identifying sets of similar clusters. +#' @param features A L(features) by N(samples) numeric matrix. +#' @param class A vector of K label assignemnts. +#' @param cellTypes List where each element is a cell type and all the clusters +#' within that cell type (i.e. subtypes). +#' @param oneoffMetric A character string. What one-off metric to run, either +#' `modified F1` or `pairwise AUC`. +#' @param threshold A numeric value. The threshold for the oneoff metric to use +#' between 0 and 1, 0.95 by default. Smaller values will result is more one-off +#' splits. +#' @param reuseFeatures Logical. Whether or not a feature can be used more than +#' once on the same cluster. Default is TRUE. +#' @param altSplit Logical. Whether or not to force a marker for clusters that +#' are solely defined by the absence of markers. Defulault is TRUE +#' @param consecutiveOneoff Logical. Whether or not to allow one-off splits at +#' consecutive brances. Default it TRUE +#' @return A named list with five elements. +#' \itemize{ +#' \item rules - A named list with one `data.frame` for every label. Each +#' `data.frame` has five columns and gives the set of rules for disinguishing +#' each label. +#' \itemize{ +#' \item feature - Feature identifier. +#' \item direction - Relationship to feature value, -1 if less than, 1 if +#' greater than. +#' \item value - The feature value which defines the decision boundary +#' \item stat - The performance value returned by the splitting metric for +#' this split. +#' \item statUsed - Which performance metric was used. "IG" if information +#' gain and "OO" if one-off. +#' \item level - The level of the tree at which is rule was defined. 1 is the +#' level of the first split of the tree. +#' } +#' \item dendro - A dendrogram object of the decision tree output +#' \item summaryMatrix - A K(labels) by L(features) matrix representation of +#' the decision rules. Columns denote features and rows denote labels. Non-0 +#' values denote instances where a feature was used on a given label. Positive +#' and negative values denote whether the values of the label for that feature +#' were greater than or less than the decision threshold, respectively. The +#' magnitude of Non-0 values denote the level at which the feature was used, +#' where the first split has a magnitude of 1. Note, if reuse_features = TRUE, +#' only the final usage of a feature for a given label is shown. +#' \item prediction - A character vector of label of predictions of the +#' training data using the final model. "MISSING" if label prediction was +#' ambiguous. +#' \item performance - A named list denoting the training performance of the +#' model. +#' \itemize{ +#' \item accuracy - (number correct/number of samples) for the whole set of +#' samples. +#' \item balAcc - mean sensitivity across all labels +#' \item meanPrecision - mean precision across all labels +#' \item correct - the number of correct predictions of each label +#' \item sizes - the number of actual counts of each label +#' \item sensitivity - the sensitivity of the prediciton of each label. +#' \item precision - the precision of the prediciton of each label. +#' } +#' } +#' @examples +#' library(M3DExampleData) +#' counts <- M3DExampleData::Mmus_example_list$data +#' # subset 100 genes for fast clustering +#' counts <- as.matrix(counts[1500:2000, ]) +#' # cluster genes into 10 modules for quick demo +#' cm <- celda_CG(counts = counts, L = 10, K = 5, verbose = FALSE) +#' # Get features matrix and cluster assignments +#' factorized <- factorizeMatrix(counts, cm) +#' features <- factorized$proportions$cell +#' class <- clusters(cm)$z +#' # Generate Decision Tree +#' DecTree <- findMarkers(features, +#' class, +#' oneoffMetric = "modified F1", +#' threshold = 1, +#' consecutiveOneoff = FALSE) +#' +#' # Plot dendrogram +#' plotDendro(DecTree) +#' @import magrittr +#' @importFrom methods hasArg +#' @export +findMarkers <- function(features, + class, + cellTypes, + oneoffMetric = c("modified F1", "pairwise AUC"), + threshold = 0.95, + reuseFeatures = FALSE, + altSplit = TRUE, + consecutiveOneoff = TRUE) { + + if (ncol(features) != length(class)) { + stop("Number of columns of features must equal length of class") + } + + if (any(is.na(class))) { + stop("NA class values") + } + + if (any(is.na(features))) { + stop("NA feature values") + } + + # Match the oneoffMetric argument + oneoffMetric <- match.arg(oneoffMetric) + + # Transpose features + features <- t(features) + + # If no detailed cell types are provided + if (!methods::hasArg(cellTypes)) { + + message("Building tree...") + + # Set class to factor + class <- as.factor(class) + + # Generate list of tree levels + tree <- .generateTreeList( + features, + class, + oneoffMetric, + threshold, + reuseFeatures, + consecutiveOneoff) + + # Add alternative node for the solely down-regulated leaf + if (altSplit) { + tree <- .addAlternativeSplit(tree, features, class) + } + + message("Computing performance metrics...") + + # Format tree output for plotting and generate summary statistics + DTsummary <- .summarizeTree(tree, features, class) + + return(DTsummary) + } else { + # If detailed cell types are provided + + # Check that cell types match class labels + if (mean(unlist(cellTypes) %in% unique(class)) != 1) { + stop("Provided cell types do not match class labels. ", + "Please check the 'cellTypes' argument.") + } + + # Create vector with cell type class labels + newLabels <- class + for (i in names(cellTypes)) { + newLabels[newLabels %in% cellTypes[[i]]] <- i + } + + # Find which cell types have more than one cluster + largeCellTypes <- names(cellTypes[lengths(cellTypes) > 1]) + + # Update cell subtype labels + subtypeLabels <- newLabels + subtypeLabels[subtypeLabels %in% largeCellTypes] <- paste0( + subtypeLabels[subtypeLabels %in% largeCellTypes], + "(", + class[subtypeLabels %in% largeCellTypes], + ")" + ) + + # Create tree for cell types + message("Building tree for all cell types...") + tree <- .generateTreeList(features, as.factor(newLabels), oneoffMetric, + threshold, reuseFeatures, consecutiveOneoff) + tree <- list(rules = .mapClass2features(tree, features, + as.factor(newLabels))$rules, + dendro = .convertToDendrogram(tree, as.factor(newLabels))) + + # Store tree's dendrogram in a separate variable + dendro <- tree$dendro + + # Create separate trees for each cell type with more than one cluster + newTrees <- lapply(largeCellTypes, function(cellType) { + + # Print current status + message("Building tree for cell type ", cellType) + + # Remove used features + featUse <- colnames(features) + if (!reuseFeatures) { + featUse <- featUse[!featUse %in% tree$rules[[cellType]]$feature] + } + + # Create new tree + newTree <- .generateTreeList(features[newLabels == cellType, + featUse], + as.factor(subtypeLabels[newLabels == cellType]), + oneoffMetric, threshold, + reuseFeatures, consecutiveOneoff) + newTree <- list(rules = .mapClass2features(newTree, + features[newLabels == cellType, ], + as.factor(subtypeLabels[ + newLabels == cellType]))$rules, + dendro = .convertToDendrogram(newTree, + as.factor(subtypeLabels[ + newLabels == cellType]))) + + # Adjust 'rules' table for new tree + newTree$rules <- lapply(newTree$rules, function(rules) { + rules$level <- rules$level + max(tree$rules[[cellType]]$level) + rules <- rbind(tree$rules[[cellType]], rules) + }) + + return(newTree) + }) + names(newTrees) <- largeCellTypes + + # Fix max depth in original tree + maxDepth <- max(unlist(lapply(newTrees, function(newTree) { + lapply(newTree$rules, function(ruleDF) { + ruleDF$level + }) + }))) + addDepth <- maxDepth - attributes(dendro)$height + + dendro <- dendrapply(dendro, function(node, addDepth) { + if (attributes(node)$height > 1) { + attributes(node)$height <- attributes(node)$height + + addDepth + 1 + } + return(node) + }, addDepth) + + # Find indices of cell type nodes in tree + indices <- lapply(largeCellTypes, + function(cellType) { + # Initialize sub trees, indices string, and flag + dendSub <- dendro + index <- "" + flag <- TRUE + + while (flag) { + # Get the edge with the class of interest + whEdge <- which(unlist(lapply(dendSub, + function(edge) + cellType %in% + attributes(edge)$classLabels))) + + # Add this as a string + index <- paste0(index, "[[", whEdge, "]]") + + # Move to this branch + dendSub <- eval(parse(text = + paste0("dendro", index))) + + # Is this the only class in that branch + flag <- length( + attributes(dendSub)$classLabels) > 1 + } + + return(index) + } + ) + names(indices) <- largeCellTypes + + # Add each cell type tree + for (cellType in largeCellTypes) { + + # Get current tree + cellTypeDendro <- newTrees[[cellType]]$dendro + + # Adjust labels, member count, and midpoint of nodes + dendro <- dendrapply(dendro, function(node) { + # Check if in right branch + if (cellType %in% as.character(attributes(node)$classLabels)) { + # Replace cell type label with subtype labels + attributes(node)$classLabels <- + as.character(attributes(node)$classLabels) %>% + .[. != cellType] %>% + c(., unique(subtypeLabels)[grep(cellType, + unique(subtypeLabels))]) + + # Assign new member count for this branch + attributes(node)$members <- + length(attributes(node)$classLabels) + + # Assign new midpoint for this branch + attributes(node)$midpoint <- + (attributes(node)$members - 1) / 2 + } + return(node) + }) + + # Replace label at new tree's branch point + branchPointAttr <- attributes(eval(parse( + text = paste0("dendro", indices[[cellType]])))) + branchPointLabel <- branchPointAttr$label + branchPointStatUsed <- branchPointAttr$statUsed + + if (!is.null(branchPointLabel)) { + attributes(cellTypeDendro)$label <- branchPointLabel + attributes(cellTypeDendro)$statUsed <- branchPointStatUsed + } + + # Fix height + indLoc <- gregexpr("\\[\\[", indices[[cellType]])[[1]] + indLoc <- indLoc[length(indLoc)] + parentIndexString <- substr(indices[[cellType]], + 0, + indLoc - 1) + parentHeight <- attributes(eval(parse( + text = paste0("dendro", parentIndexString))))$height + cellTypeHeight <- attributes(cellTypeDendro)$height + cellTypeDendro <- dendrapply(cellTypeDendro, + function(node, parentHeight, cellTypeHeight) { + if (attributes(node)$height > 1) { + attributes(node)$height <- + parentHeight - 1 - (cellTypeHeight - attributes( + node)$height) + } + return(node) + }, parentHeight, cellTypeHeight) + + # Add new tree to original tree + eval(parse(text = paste0( + "dendro", indices[[cellType]], " <- cellTypeDendro"))) + + # Append new tree's 'rules' tables to original tree + tree$rules <- append(tree$rules, newTrees[[cellType]]$rules) + + # Remove old tree's rules + tree$rules <- tree$rules[-which(names(tree$rules) == cellType)] + } + + # Set final tree dendro + tree$dendro <- dendro + + # Get performance statistics + message("Computing performance statistics...") + perfList <- .getPerformance(tree$rules, + features, + as.factor(subtypeLabels)) + tree$prediction <- perfList$prediction + tree$performance <- perfList$performance + + return(tree) + } +} diff --git a/R/getDecisions.R b/R/getDecisions.R index e5611cc6..c6d268a7 100644 --- a/R/getDecisions.R +++ b/R/getDecisions.R @@ -1,8 +1,8 @@ #' @title Gets cluster estimates using rules generated by -#' `celda::buildTreeHybrid` +#' `celda::findMarkers` #' @description Get decisions for a matrix of features. Estimate cell #' cluster membership using feature matrix input. -#' @param rules List object. The `rules` element from `buildTreeHybrid` +#' @param rules List object. The `rules` element from `findMarkers` #' output. Returns NA if cluster estimation was ambiguous. #' @param features A L(features) by N(samples) numeric matrix. #' @return A character vector of label predicitions. @@ -18,7 +18,7 @@ #' features <- factorized$proportions$cell #' class <- clusters(cm)$z #' # Generate Decision Tree -#' DecTree <- buildTreeHybrid(features, +#' DecTree <- findMarkers(features, #' class, #' oneoffMetric = "modified F1", #' threshold = 1, @@ -34,7 +34,7 @@ getDecisions <- function(rules, features) { } # Function to predict class from list of rules -.predictClass <- function(samp, rules){ +.predictClass <- function(samp, rules) { # Initilize possible classes and level classes <- names(rules) @@ -60,7 +60,7 @@ getDecisions <- function(rules, features) { ruleClass$sample <- samp[ruleClass$feature] # For multiple direction == 1, use one with the top stat - if (sum(ruleClass$direction == 1) > 1){ + if (sum(ruleClass$direction == 1) > 1) { ruleClass <- ruleClass[order( ruleClass$direction , decreasing = T), ] diff --git a/R/plotDendro.R b/R/plotDendro.R index f189c59e..4b8880d5 100644 --- a/R/plotDendro.R +++ b/R/plotDendro.R @@ -1,11 +1,13 @@ -#' @title Plots dendrogram of `buildTreeHybrid` output +#' @title Plots dendrogram of `findMarkers` output #' @description Generates a dendrogram of the rules and performance -#' (optional) of the decision tree generates by `buildTreeHybrid`. -#' @param decisionTree List object. The output of `celda::buildTreeHybrid`. +#' (optional) of the decision tree generates by `findMarkers`. +#' @param decisionTree List object. The output of `celda::findMarkers`. #' @param classLabel A character value. The name of a label to which draw #' the path and rules. If NULL (default), the rules for every cluster is shown. #' @param addSensPrec Logical. Print training sensitivities and precisions -#' for each cluster below leaf label? Default is TRUE. +#' for each cluster below leaf label? Default is FALSE. +#' @param maxFeaturePrint A numeric value. Maximum number of feature IDs to +#' print at a given node. Default is 4. #' @param leafSize A numeric value. Size of text below each leaf. Default is 24. #' @param boxSize A numeric value. Size of rule labels. Default is 7. #' @param boxColor A character value. Color of rule labels. Default is `black`. @@ -21,7 +23,7 @@ #' features <- factorized$proportions$cell #' class <- clusters(cm)$z #' # Generate Decision Tree -#' decTree <- buildTreeHybrid(features, +#' decTree <- findMarkers(features, #' class, #' oneoffMetric = "modified F1", #' threshold = 1, @@ -36,7 +38,8 @@ #' @export plotDendro <- function(decisionTree, classLabel = NULL, - addSensPrec = TRUE, + addSensPrec = FALSE, + maxFeaturePrint = 4, leafSize = 24, boxSize = 7, boxColor = "black") { @@ -49,9 +52,9 @@ plotDendro <- function(decisionTree, # Create vector of per class performance perfVec <- paste(performance$sizes, - format(round(performance$sensitivity, 2), nsmall = 2), - format(round(performance$precision, 2), nsmall = 2), - sep = "\n" + format(round(performance$sensitivity, 2), nsmall = 2), + format(round(performance$precision, 2), nsmall = 2), + sep = "\n" ) names(perfVec) <- names(performance$sensitivity) @@ -75,14 +78,23 @@ plotDendro <- function(decisionTree, segs <- as.data.frame(dendextend::get_nodes_xy(dendro)) colnames(segs) <- c("xend", "yend") - # As label and which stat was used - # Labels will stack + # Add labels to nodes segs$label <- gsub(";", "\n", dendextend::get_nodes_attr(dendro, "label")) + segs$label <- sapply(segs$label, function(lab, maxFeaturePrint) { + loc <- gregexpr("\n", lab)[[1]][maxFeaturePrint] + if (!is.na(loc)) { + lab <- substr(lab, 1, loc - 1) + } + return(lab) + }, maxFeaturePrint) + + # Subset for max + segs$statUsed <- dendextend::get_nodes_attr(dendro, "statUsed") # If highlighting a class label, remove non-class specific rules if (!is.null(classLabel)) { - if (!classLabel %in% rownames(decisionTree$summaryMatrix)) { + if (!classLabel %in% names(decisionTree$rules)) { stop("classLabel not a valid class ID.") } dendro <- .highlightClassLabel(dendro, classLabel) @@ -127,6 +139,13 @@ plotDendro <- function(decisionTree, # Add sensitivity and precision measurements if (addSensPrec) { leafLabels <- paste(leafLabels, perfVec[leafLabels], sep = "\n") + leafAngle <- 0 + leafHJust <- 0.5 + leafVJust <- -1 + } else { + leafAngle <- 90 + leafHJust <- 1 + leafVJust <- 0.5 } # Create plot of dendrogram @@ -153,10 +172,12 @@ plotDendro <- function(decisionTree, panel.border = ggplot2::element_blank(), axis.title = ggplot2::element_blank(), axis.ticks = ggplot2::element_blank(), - axis.text.x = ggplot2::element_text(hjust = 0.5, + axis.text.x = ggplot2::element_text( + hjust = leafHJust, + angle = leafAngle, size = leafSize, family = "mono", - vjust = -1), + vjust = leafVJust), axis.text.y = ggplot2::element_blank() )) diff --git a/R/plot_dr.R b/R/plot_dr.R index 77990c8e..530a3887 100755 --- a/R/plot_dr.R +++ b/R/plot_dr.R @@ -17,8 +17,12 @@ #' The color will be used to signify the midpoint on the scale. #' @param colorHigh Character. A color available from `colors()`. #' The color will be used to signify the highest values on the scale. -#' Default 'blue'. +#' Default 'blue'. #' @param varLabel Character vector. Title for the color legend. +#' @param ncol Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +#' number of columns for facet wrap. +#' @param headers Character vector. If `NULL`, the corresponding rownames are +#' used as labels. Otherwise, these headers are used to label the genes. #' @return The plot as a ggplot object #' @examples #' data(celdaCGSim, celdaCGMod) @@ -45,7 +49,9 @@ plotDimReduceGrid <- function(dim1, colorLow, colorMid, colorHigh, - varLabel) { + varLabel, + ncol = NULL, + headers = NULL) { df <- data.frame(dim1, dim2, t(as.data.frame(matrix))) naIx <- is.na(dim1) | is.na(dim2) @@ -54,24 +60,59 @@ plotDimReduceGrid <- function(dim1, m <- reshape2::melt(df, id.vars = c("dim1", "dim2")) colnames(m) <- c(xlab, ylab, "facet", varLabel) - ggplot2::ggplot(m, - ggplot2::aes_string(x = xlab, y = ylab)) + - ggplot2::geom_point(stat = "identity", - size = size, - ggplot2::aes_string(color = varLabel)) + - ggplot2::facet_wrap(~ facet) + - ggplot2::theme_bw() + - ggplot2::scale_colour_gradient2(low = colorLow, - high = colorHigh, - mid = colorMid, - midpoint = (max(m[, 4]) + min(m[, 4])) / 2, - name = gsub("_", " ", varLabel)) + - ggplot2::theme(strip.background = ggplot2::element_blank(), - panel.grid.major = ggplot2::element_blank(), - panel.grid.minor = ggplot2::element_blank(), - panel.spacing = unit(0, "lines"), - panel.background = ggplot2::element_blank(), - axis.line = ggplot2::element_line(colour = "black")) + if (isFALSE(is.null(headers))) { + names(headers) <- levels(m$facet) + headers <- ggplot2::as_labeller(headers) + + g <- ggplot2::ggplot(m, + ggplot2::aes_string(x = xlab, y = ylab)) + + ggplot2::geom_point(stat = "identity", + size = size, + ggplot2::aes_string(color = varLabel)) + + ggplot2::theme_bw() + + ggplot2::scale_colour_gradient2(low = colorLow, + high = colorHigh, + mid = colorMid, + midpoint = (max(m[, 4]) + min(m[, 4])) / 2, + name = gsub("_", " ", varLabel)) + + ggplot2::theme(strip.background = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_blank(), + panel.grid.minor = ggplot2::element_blank(), + panel.spacing = unit(0, "lines"), + panel.background = ggplot2::element_blank(), + axis.line = ggplot2::element_line(colour = "black")) + if (isFALSE(is.null(ncol))) { + g <- g + ggplot2::facet_wrap(~ facet, labeller = headers, + ncol = ncol) + } else { + g <- g + ggplot2::facet_wrap(~ facet, labeller = headers) + } + } else { + g <- ggplot2::ggplot(m, + ggplot2::aes_string(x = xlab, y = ylab)) + + ggplot2::geom_point(stat = "identity", + size = size, + ggplot2::aes_string(color = varLabel)) + + ggplot2::facet_wrap(~ facet) + + ggplot2::theme_bw() + + ggplot2::scale_colour_gradient2(low = colorLow, + high = colorHigh, + mid = colorMid, + midpoint = (max(m[, 4]) + min(m[, 4])) / 2, + name = gsub("_", " ", varLabel)) + + ggplot2::theme(strip.background = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_blank(), + panel.grid.minor = ggplot2::element_blank(), + panel.spacing = unit(0, "lines"), + panel.background = ggplot2::element_blank(), + axis.line = ggplot2::element_line(colour = "black")) + if (isFALSE(is.null(ncol))) { + g <- g + ggplot2::facet_wrap(~ facet, ncol = ncol) + } else { + g <- g + ggplot2::facet_wrap(~ facet) + } + } + return(g) } @@ -87,6 +128,8 @@ plotDimReduceGrid <- function(dim1, #' @param counts Integer matrix. Rows represent features and columns #' represent cells. #' @param features Character vector. Uses these genes for plotting. +#' @param headers Character vector. If `NULL`, the corresponding rownames are +#' used as labels. Otherwise, these headers are used to label the genes. #' @param normalize Logical. Whether to normalize the columns of `counts`. #' Default TRUE. #' @param exactMatch Logical. Whether an exact match or a partial match using @@ -99,11 +142,13 @@ plotDimReduceGrid <- function(dim1, #' @param xlab Character vector. Label for the x-axis. Default "Dimension_1". #' @param ylab Character vector. Label for the y-axis. Default "Dimension_2". #' @param colorLow Character. A color available from `colors()`. The color -#' will be used to signify the lowest values on the scale. Default 'grey'. +#' will be used to signify the lowest values on the scale. Default 'blue'. #' @param colorMid Character. A color available from `colors()`. The color -#' will be used to signify the midpoint on the scale. +#' will be used to signify the midpoint on the scale. Default 'white'. #' @param colorHigh Character. A color available from `colors()`. The color -#' will be used to signify the highest values on the scale. Default 'blue'. +#' will be used to signify the highest values on the scale. Default 'red'. +#' @param ncol Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +#' number of columns for facet wrap. #' @return The plot as a ggplot object #' @examples #' \donttest{ @@ -121,15 +166,32 @@ plotDimReduceFeature <- function(dim1, dim2, counts, features, + headers = NULL, normalize = TRUE, exactMatch = TRUE, trim = c(-2, 2), size = 1, xlab = "Dimension_1", ylab = "Dimension_2", - colorLow = "grey", - colorMid = NULL, - colorHigh = "blue") { + colorLow = "blue", + colorMid = "white", + colorHigh = "red", + ncol = NULL) { + + if (isFALSE(is.null(headers))) { + if (length(headers) != length(features)) { + stop("Headers ", + headers, + " should be the same length as features ", + features) + } + + if (isFALSE(exactMatch)) { + warning("exactMatch is FALSE. headers will not be used!") + headers <- NULL + } + } + if (isTRUE(normalize)) { counts <- normalizeCounts(counts, transformationFun = sqrt, @@ -172,7 +234,7 @@ plotDimReduceFeature <- function(dim1, paste(notFound, sep = "", collapse = ",")) - } + } } else { featuresNotFound <- setdiff(features, intersect(features, rownames(counts))) @@ -186,7 +248,11 @@ plotDimReduceFeature <- function(dim1, paste(featuresNotFound, sep = "", collapse = ",")) + if (isFALSE(is.null(headers))) { + whichHeadersNotFound <- which(featuresNotFound == features) + headers <- headers[-whichHeadersNotFound] } + } featuresFound <- setdiff(features, featuresNotFound) counts <- counts[featuresFound, , drop = FALSE] } @@ -199,7 +265,9 @@ plotDimReduceFeature <- function(dim1, colorLow, colorMid, colorHigh, - varLabel) + varLabel, + ncol, + headers) } @@ -232,6 +300,8 @@ plotDimReduceFeature <- function(dim1, #' @param colorHigh Character. A color available from `colors()`. #' The color will be used to signify the highest values on the scale. #' Default 'blue'. +#' @param ncol Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +#' number of columns for facet wrap. #' @return The plot as a ggplot object #' @examples #' \donttest{ @@ -256,7 +326,8 @@ plotDimReduceModule <- ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, - colorHigh = "blue") { + colorHigh = "blue", + ncol = NULL) { factorized <- factorizeMatrix(celdaMod = celdaMod, counts = counts) @@ -293,7 +364,8 @@ plotDimReduceModule <- colorLow, colorMid, colorHigh, - varLabel) + varLabel, + ncol) } @@ -317,6 +389,8 @@ plotDimReduceModule <- #' If NULL, all clusters will be colored. Default NULL. #' @param labelClusters Logical. Whether the cluster labels are plotted. #' Default FALSE. +#' @param groupBy Character vector. Contains sample labels for each cell. +#' If NULL, all samples will be plotted together. Default NULL. #' @param labelSize Numeric. Sets size of label if labelClusters is TRUE. #' Default 3.5. #' @return The plot as a ggplot object @@ -340,16 +414,26 @@ plotDimReduceCluster <- function(dim1, ylab = "Dimension_2", specificClusters = NULL, labelClusters = FALSE, + groupBy = NULL, labelSize = 3.5) { - df <- data.frame(dim1, dim2, cluster) - colnames(df) <- c(xlab, ylab, "Cluster") + + if (!is.null(groupBy)) { + df <- data.frame(dim1, dim2, cluster, groupBy) + colnames(df) <- c(xlab, ylab, "Cluster", "Sample") + } else { + df <- data.frame(dim1, dim2, cluster) + colnames(df) <- c(xlab, ylab, "Cluster") + } + naIx <- is.na(dim1) | is.na(dim2) df <- df[!naIx, ] df[3] <- as.factor(df[[3]]) clusterColors <- distinctColors(nlevels(as.factor(cluster))) + if (!is.null(specificClusters)) { clusterColors[!levels(df[[3]]) %in% specificClusters] <- "gray92" } + g <- ggplot2::ggplot(df, ggplot2::aes_string(x = xlab, y = ylab)) + ggplot2::geom_point(stat = "identity", size = size, @@ -364,25 +448,32 @@ plotDimReduceCluster <- function(dim1, ggplot2::guide_legend(override.aes = list(size = 1))) if (labelClusters == TRUE) { - centroidList <- lapply(seq(length(unique(cluster))), function(x) { + #centroidList <- lapply(seq(length(unique(cluster))), function(x) { + centroidList <- lapply(unique(cluster), function(x) { df.sub <- df[df$Cluster == x, ] - median.1 <- stats::median(df.sub$Dimension_1) - median.2 <- stats::median(df.sub$Dimension_2) + median.1 <- stats::median(df.sub[, xlab]) + median.2 <- stats::median(df.sub[, ylab]) cbind(median.1, median.2, x) }) centroid <- do.call(rbind, centroidList) - centroid <- as.data.frame(centroid) + centroid <- data.frame(Dimension_1 = as.numeric(centroid[, 1]), + Dimension_2 = as.numeric(centroid[, 2]), + Cluster = centroid[, 3]) - colnames(centroid) <- c("Dimension_1", "Dimension_2", "Cluster") + colnames(centroid)[seq(2)] <- c(xlab, ylab) g <- g + ggplot2::geom_point(data = centroid, - mapping = ggplot2::aes_string(x = "Dimension_1", - y = "Dimension_2"), + mapping = ggplot2::aes_string(x = xlab, + y = ylab), size = 0, alpha = 0) + ggrepel::geom_text_repel(data = centroid, - mapping = ggplot2::aes_string(label = "Cluster"), + mapping = ggplot2::aes(label = Cluster), size = labelSize) } + if (!is.null(x = groupBy)) { + g <- g + facet_wrap(facets = vars(!!sym(x = "Sample"))) + + theme(strip.background = element_blank()) + } return(g) } @@ -393,14 +484,14 @@ plotDimReduceCluster <- function(dim1, # @param maxIter Numeric vector. Determines iterations for tsne. Default 1000. # @param doPca Logical. Whether to perform # dimensionality reduction with PCA before tSNE. -# @param initialDims Integer.Number of dimensions from PCA to use as -# input in tSNE. +# @param initialDims Integer. Number of dimensions from PCA to use as +# input in tSNE. Default 50. #' @importFrom Rtsne Rtsne .calculateTsne <- function(norm, perplexity = 20, maxIter = 2500, doPca = FALSE, - initialDims = 20) { + initialDims = 50) { res <- Rtsne::Rtsne( norm, @@ -415,12 +506,46 @@ plotDimReduceCluster <- function(dim1, } -# Run the umap algorithm for dimensionality reduction +# Run the UMAP algorithm for dimensionality reduction # @param norm Normalized count matrix. -# @param umapConfig An object of class umap.config, -# containing configuration parameters to be passed to umap. -# Default umap::umap.defualts. -#' @importFrom umap umap -.calculateUmap <- function(norm, umapConfig = umap::umap.defaults) { - return(umap::umap(norm, umapConfig)$layout) +# @param nNeighbors The size of local neighborhood used for +# manifold approximation. Larger values result in more global +# views of the manifold, while smaller values result in more +# local data being preserved. Default 30. +# See `?uwot::umap` for more information. +# @param minDist The effective minimum distance between embedded points. +# Smaller values will result in a more clustered/clumped +# embedding where nearby points on the manifold are drawn +# closer together, while larger values will result on a more +# even dispersal of points. Default 0.2. +# See `?uwot::umap` for more information. +# @param spread The effective scale of embedded points. In combination with +# ‘min_dist’, this determines how clustered/clumped the +# embedded points are. Default 1. +# See `?uwot::umap` for more information. +# @param pca Logical. Whether to perform +# dimensionality reduction with PCA before UMAP. +# @param initialDims Integer. Number of dimensions from PCA to use as +# input in UMAP. Default 50. +# @param cores Number of threads to use. Default 1. +# @param ... Other parameters to pass to `uwot::umap`. +#' @import uwot +.calculateUmap <- function(norm, + nNeighbors = 30, + minDist = 0.75, + spread = 1, + pca=FALSE, + initialDims = 50, + cores = 1, + ...) { + if (isTRUE(pca)) { + doPCA <- initialDims + } else { + doPCA <- NULL + } + + res <- uwot::umap(norm, n_neighbors = nNeighbors, + min_dist = minDist, spread = spread, + n_threads = cores, n_sgd_threads = 1, pca = doPCA, ...) + return(res) } diff --git a/R/semi_pheatmap.R b/R/semi_pheatmap.R index d977a828..01e8afa5 100755 --- a/R/semi_pheatmap.R +++ b/R/semi_pheatmap.R @@ -755,7 +755,7 @@ vplayout <- function(x, y) { } # Omit border color if cell size is too small - if (mindim < 3){ + if (mindim < 3) { borderColor <- NA } @@ -898,8 +898,8 @@ vplayout <- function(x, y) { } # Draw annotation legend - annotation <- c(annotationCol[length(annotationCol):1], - annotationRow[length(annotationRow):1]) + annotation <- c(annotationCol[seq(length(annotationCol), 1)], + annotationRow[seq(length(annotationRow), 1)]) annotation <- annotation[unlist(lapply(annotation, function(x) !.is.na2(x)))] @@ -1494,7 +1494,7 @@ vplayout <- function(x, y) { #' clusteringDistanceCols = dcols) #' #' # Modify ordering of the clusters using clustering callback option -#' callback = function(hc, mat){ +#' callback = function(hc, mat) { #' sv = svd(t(mat))$v[, 1] #' dend = reorder(as.dendrogram(hc), wts = sv) #' as.hclust(dend) @@ -1552,6 +1552,8 @@ semiPheatmap <- function(mat, silent = FALSE, rowLabel, colLabel, + rowGroupOrder = NULL, + colGroupOrder = NULL, ...) { # Set labels @@ -1620,7 +1622,8 @@ semiPheatmap <- function(mat, if (is.null(rowLabel)) { rowLabel <- rep(1, nrow(mat)) } else { - o <- order(rowLabel) + #o <- order(rowLabel) + o <- .Order(labels = rowLabel, groupOrder = rowGroupOrder) mat <- mat[o, , drop = FALSE] fmat <- fmat[o, , drop = FALSE] rowLabel <- rowLabel[o] @@ -1654,7 +1657,8 @@ semiPheatmap <- function(mat, if (is.null(colLabel)) { colLabel <- rep(1, ncol(mat)) } else { - o <- order(colLabel) + #o <- order(colLabel) + o <- .Order(labels = colLabel, groupOrder = colGroupOrder) mat <- mat[, o, drop = FALSE] fmat <- fmat[, o, drop = FALSE] colLabel <- colLabel[o] @@ -1781,3 +1785,27 @@ semiPheatmap <- function(mat, treeCol = treeCol, gtable = gt)) } + + + + +# order function that order the row/column labels +# based on the order of the group priority +# return value is a vector of the ordered index +# labels is a vector of any non-zero length +# groupOrder, a column named dataframe/matrix +# with the "groupName" column storing the group +# name and the "groupIndex" storing the group priority +.Order <- function(labels, groupOrder=NULL) { + if (is.null(groupOrder)) { + return(order(labels)) + } else { + # Throw error is length(unique(labels)) != nrow(groupOrder) + olabels <- plyr::mapvalues(x = labels, + from = groupOrder[, "groupName"], + to = groupOrder[, "groupIndex"]) + # Make sure the olabels is integer for order() function + olabels <- as.integer(olabels) + return(order(olabels)) + } +} diff --git a/R/split_clusters.R b/R/split_clusters.R index 676f0d1f..33f02ccc 100644 --- a/R/split_clusters.R +++ b/R/split_clusters.R @@ -31,16 +31,6 @@ } ## Loop through each split-able Z and perform split - clustSplit <- lapply(zToSplit, function(x) { - clusters(.celda_C(counts[, z == x], - K = 2, - zInitialize = "random", - maxIter = 5, - splitOnIter = -1, - splitOnLast = FALSE, - verbose = FALSE))$z - }) - clustSplit <- vector("list", K) for (i in zToSplit) { clustLabel <- .celda_C( diff --git a/R/summarizeTreeHelper.R b/R/summarizeTreeHelper.R index c0c0a125..0a12b48b 100644 --- a/R/summarizeTreeHelper.R +++ b/R/summarizeTreeHelper.R @@ -13,7 +13,6 @@ return(list( rules = class2features$rules, dendro = dendro, - summaryMatrix = class2features$c2fMatrix, prediction = perfList$prediction, performance = perfList$performance )) @@ -265,14 +264,9 @@ subUnderscore <- function(x, n) unlist(lapply( )) } -# Create matrix of classes and features combinations +# Create rules of classes and features sequences .mapClass2features <- function(tree, features, class) { - # Create empty matrix - c2fMatrix <- matrix(0, nrow = length(unique(class)), ncol = ncol(features)) - rownames(c2fMatrix) <- sort(unique(class)) - colnames(c2fMatrix) <- colnames(features) - # Get class to feature indices class2featuresIndices <- do.call(rbind, lapply( seq(length(tree)), @@ -320,14 +314,6 @@ subUnderscore <- function(x, n) unlist(lapply( })) rownames(class2featuresIndices) <- NULL - # Add levels to matrix - for (i in seq(nrow(class2featuresIndices))) { - c2fMatrix[class2featuresIndices[i, "class"], - class2featuresIndices[i, "feature"]] <- - class2featuresIndices[i, "level"] * - class2featuresIndices[i, "direction"] - } - # Generate list of rules for each class rules <- lapply(levels(class), function(cl, class2featuresIndices) { @@ -338,6 +324,5 @@ subUnderscore <- function(x, n) unlist(lapply( names(rules) <- levels(class) return(list( - c2fMatrix = c2fMatrix, rules = rules)) } diff --git a/README.md b/README.md index 70522e99..9bed5990 100755 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ library(devtools) install_github("campbio/celda") ``` -For `R-3.5` users, please install from the `R_3_5` branch. This version of **celda** is identical to the most recent release of **celda** (`master` branch) except it also works on `R-3.5`. **NOTE:** This branch is no longer updated. Please use `R-3.6` versions. +For `R-3.5` users, please install from the `R_3_5` branch. This version of **celda** is identical to the most recent release of **celda** (`master` branch) except it also works on `R-3.5`. ``` library(devtools) install_github("campbio/celda@R_3_5") @@ -55,4 +55,9 @@ The vignette in HTML format showing how to use **celda** is available on Biocond Example vignette of doing single-cell RNA-seq data decontamination using DecontX is available [here](http://bioconductor.org/packages/release/bioc/vignettes/celda/inst/doc/DecontX-analysis.html). ## For developers -Check out our [Wiki](https://github.com/campbio/celda/wiki) for [coding style guide](https://github.com/campbio/celda/wiki/Celda-Development-Coding-Style-Guide) if you want to contribute! +Check out our [Wiki](https://github.com/campbio/celda/wiki) for developer's guide if you want to contribute! +- [Celda Development Coding Style Guide](https://github.com/campbio/celda/wiki/Celda-Development-Coding-Style-Guide) +- [Celda Development Robust and Efficient Code](https://github.com/campbio/celda/wiki/Celda-Development-Robust-and-Efficient-Code) +- [Celda Development Rstudio configuration](https://github.com/campbio/celda/wiki/Celda-Development-Rstudio-configuration) +- [FAQ on how to use celda](https://github.com/campbio/celda/wiki/FAQ-on-how-to-use-celda) +- [FAQ on package development](https://github.com/campbio/celda/wiki/FAQ-on-package-development) diff --git a/inst/NEWS b/inst/NEWS index d07e0095..3fe3af08 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,8 +1,20 @@ +Changes in version 1.1.6 (2019-07-16): + + o Add multiclass decision tree + +Changes in version 1.1.4 (2019-05-28): + + o Add Alternate headings support for plotDimReduceFeature + +Changes in version 1.1.3 (2019-05-14): + + o Add multiclass decision tree (MCDT) cell cluster annotation + Changes in version 1.1.2 (2019-05-14): o Fix a bug in celdaHeatmap -Changes in version 1.1.1 (2019-05-09): +Changes in version 1.0.1 (2019-05-09): o Default seed setting to maintain reproducibility diff --git a/man/bestLogLikelihood-celdaModel-method.Rd b/man/bestLogLikelihood-celdaModel-method.Rd index fdcf64b0..5497314c 100644 --- a/man/bestLogLikelihood-celdaModel-method.Rd +++ b/man/bestLogLikelihood-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{bestLogLikelihood,celdaModel-method} \alias{bestLogLikelihood,celdaModel-method} \title{Get the log-likelihood} diff --git a/man/celdaGridSearch.Rd b/man/celdaGridSearch.Rd index dbea710c..bd99dd36 100644 --- a/man/celdaGridSearch.Rd +++ b/man/celdaGridSearch.Rd @@ -4,9 +4,19 @@ \alias{celdaGridSearch} \title{Run Celda in parallel with multiple parameters} \usage{ -celdaGridSearch(counts, model, paramsTest, paramsFixed = NULL, - maxIter = 200, nchains = 3, cores = 1, bestOnly = TRUE, - perplexity = TRUE, verbose = TRUE, logfilePrefix = "Celda") +celdaGridSearch( + counts, + model, + paramsTest, + paramsFixed = NULL, + maxIter = 200, + nchains = 3, + cores = 1, + bestOnly = TRUE, + perplexity = TRUE, + verbose = TRUE, + logfilePrefix = "Celda" +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaHeatmap-celda_C-method.Rd b/man/celdaHeatmap-celda_C-method.Rd index 69cf358a..0c31e134 100644 --- a/man/celdaHeatmap-celda_C-method.Rd +++ b/man/celdaHeatmap-celda_C-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{celdaHeatmap,celda_C-method} \alias{celdaHeatmap,celda_C-method} \title{Heatmap for celda_C} diff --git a/man/celdaHeatmap-celda_CG-method.Rd b/man/celdaHeatmap-celda_CG-method.Rd index b98407a7..f2908bf3 100644 --- a/man/celdaHeatmap-celda_CG-method.Rd +++ b/man/celdaHeatmap-celda_CG-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{celdaHeatmap,celda_CG-method} \alias{celdaHeatmap,celda_CG-method} \title{Heatmap for celda_CG} diff --git a/man/celdaHeatmap-celda_G-method.Rd b/man/celdaHeatmap-celda_G-method.Rd index 351acc05..5560653d 100644 --- a/man/celdaHeatmap-celda_G-method.Rd +++ b/man/celdaHeatmap-celda_G-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{celdaHeatmap,celda_G-method} \alias{celdaHeatmap,celda_G-method} \title{Heatmap for celda_CG} diff --git a/man/celdaPerplexity-celdaList-method.Rd b/man/celdaPerplexity-celdaList-method.Rd index cf69c1d4..3854be14 100644 --- a/man/celdaPerplexity-celdaList-method.Rd +++ b/man/celdaPerplexity-celdaList-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{celdaPerplexity,celdaList-method} \alias{celdaPerplexity,celdaList-method} \title{Get perplexity for every model in a celdaList} diff --git a/man/celdaProbabilityMap-celda_C-method.Rd b/man/celdaProbabilityMap-celda_C-method.Rd index 5bf494b9..4d4e04a1 100644 --- a/man/celdaProbabilityMap-celda_C-method.Rd +++ b/man/celdaProbabilityMap-celda_C-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{celdaProbabilityMap,celda_C-method} \alias{celdaProbabilityMap,celda_C-method} \title{Probability map for a celda_C model} \usage{ -\S4method{celdaProbabilityMap}{celda_C}(counts, celdaMod, - level = c("sample"), ...) +\S4method{celdaProbabilityMap}{celda_C}(counts, celdaMod, level = c("sample"), ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaProbabilityMap-celda_CG-method.Rd b/man/celdaProbabilityMap-celda_CG-method.Rd index fabc06c3..de724ed4 100644 --- a/man/celdaProbabilityMap-celda_CG-method.Rd +++ b/man/celdaProbabilityMap-celda_CG-method.Rd @@ -1,12 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{celdaProbabilityMap,celda_CG-method} \alias{celdaProbabilityMap,celda_CG-method} \title{Probability map for a celda_CG model} \usage{ -\S4method{celdaProbabilityMap}{celda_CG}(counts, celdaMod, - level = c("cellPopulation", "sample"), ...) +\S4method{celdaProbabilityMap}{celda_CG}( + counts, + celdaMod, + level = c("cellPopulation", "sample"), + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaTsne-celda_C-method.Rd b/man/celdaTsne-celda_C-method.Rd index cf9d1e39..e4f0f65a 100644 --- a/man/celdaTsne-celda_C-method.Rd +++ b/man/celdaTsne-celda_C-method.Rd @@ -1,13 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{celdaTsne,celda_C-method} \alias{celdaTsne,celda_C-method} \title{tSNE for celda_C} \usage{ -\S4method{celdaTsne}{celda_C}(counts, celdaMod, maxCells = 25000, - minClusterSize = 100, initialDims = 20, modules = NULL, - perplexity = 20, maxIter = 2500, seed = 12345) +\S4method{celdaTsne}{celda_C}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + initialDims = 20, + perplexity = 20, + maxIter = 2500, + seed = 12345 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -18,7 +24,8 @@ cells. This matrix should be the same as the one used to generate \item{maxCells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -requires more memory. Default 25000.} +requires more memory. If NULL, no subsampling will be performed. +Default NULL.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} @@ -27,9 +34,6 @@ threshold. Default 100.} of the dataset. The top 'initialDims' principal components will be used for tSNE. Default 20.} -\item{modules}{Integer vector. Determines which features modules to use for -tSNE. If NULL, all modules will be used. Default NULL.} - \item{perplexity}{Numeric. Perplexity parameter for tSNE. Default 20.} \item{maxIter}{Integer. Maximum number of iterations in tSNE generation. diff --git a/man/celdaTsne-celda_CG-method.Rd b/man/celdaTsne-celda_CG-method.Rd index 8f0db699..f64dba37 100644 --- a/man/celdaTsne-celda_CG-method.Rd +++ b/man/celdaTsne-celda_CG-method.Rd @@ -1,13 +1,20 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{celdaTsne,celda_CG-method} \alias{celdaTsne,celda_CG-method} \title{tSNE for celda_CG} \usage{ -\S4method{celdaTsne}{celda_CG}(counts, celdaMod, maxCells = 25000, - minClusterSize = 100, initialDims = 20, modules = NULL, - perplexity = 20, maxIter = 2500, seed = 12345) +\S4method{celdaTsne}{celda_CG}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + initialDims = 20, + modules = NULL, + perplexity = 20, + maxIter = 2500, + seed = 12345 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -18,7 +25,8 @@ cells. This matrix should be the same as the one used to generate \item{maxCells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -requires more memory. Default 25000.} +requires more memory. If NULL, no subsampling will be performed. +Default NULL.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} diff --git a/man/celdaTsne-celda_G-method.Rd b/man/celdaTsne-celda_G-method.Rd index 8ec2e43d..05144e1d 100644 --- a/man/celdaTsne-celda_G-method.Rd +++ b/man/celdaTsne-celda_G-method.Rd @@ -1,13 +1,20 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{celdaTsne,celda_G-method} \alias{celdaTsne,celda_G-method} \title{tSNE for celda_G} \usage{ -\S4method{celdaTsne}{celda_G}(counts, celdaMod, maxCells = 25000, - minClusterSize = 100, initialDims = 20, modules = NULL, - perplexity = 20, maxIter = 2500, seed = 12345) +\S4method{celdaTsne}{celda_G}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + initialDims = 20, + modules = NULL, + perplexity = 20, + maxIter = 2500, + seed = 12345 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -18,7 +25,8 @@ cells. This matrix should be the same as the one used to generate \item{maxCells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(conts) > maxCells. Larger numbers of cells -requires more memory. Default 10000.} +requires more memory. If NULL, no subsampling will be performed. +Default NULL.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} diff --git a/man/celdaTsne.Rd b/man/celdaTsne.Rd index 14b1286f..a9027f18 100644 --- a/man/celdaTsne.Rd +++ b/man/celdaTsne.Rd @@ -4,9 +4,17 @@ \alias{celdaTsne} \title{Embeds cells in two dimensions using tSNE based on celda_CG results.} \usage{ -celdaTsne(counts, celdaMod, maxCells = 25000, minClusterSize = 100, - initialDims = 20, modules = NULL, perplexity = 20, - maxIter = 2500, ...) +celdaTsne( + counts, + celdaMod, + maxCells = 25000, + minClusterSize = 100, + initialDims = 20, + modules = NULL, + perplexity = 20, + maxIter = 2500, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/celdaUmap-celda_C-method.Rd b/man/celdaUmap-celda_C-method.Rd index 8ed12672..f558a5d8 100644 --- a/man/celdaUmap-celda_C-method.Rd +++ b/man/celdaUmap-celda_C-method.Rd @@ -1,13 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{celdaUmap,celda_C-method} \alias{celdaUmap,celda_C-method} \title{umap for celda_C} \usage{ -\S4method{celdaUmap}{celda_C}(counts, celdaMod, maxCells = 25000, - minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) +\S4method{celdaUmap}{celda_C}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + seed = 12345, + nNeighbors = 30, + minDist = 0.75, + spread = 1, + pca = TRUE, + initialDims = 50, + cores = 1, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -18,23 +28,45 @@ cells. This matrix should be the same as the one used to generate \item{maxCells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -requires more memory. Default 25000.} +requires more memory. If NULL, no subsampling will be performed. +Default NULL.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} -\item{modules}{Integer vector. Determines which features modules to use for -UMAP. If NULL, all modules will be used. Default NULL.} - \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} -\item{umapConfig}{An object of class "umap.config" specifying parameters to -the UMAP algorithm.} +\item{nNeighbors}{The size of local neighborhood used for +manifold approximation. Larger values result in more global +views of the manifold, while smaller values result in more +local data being preserved. Default 30. +See `?uwot::umap` for more information.} + +\item{minDist}{The effective minimum distance between embedded points. +Smaller values will result in a more clustered/clumped +embedding where nearby points on the manifold are drawn +closer together, while larger values will result on a more +even dispersal of points. Default 0.2. +See `?uwot::umap` for more information.} + +\item{spread}{The effective scale of embedded points. In combination with +‘min_dist’, this determines how clustered/clumped the +embedded points are. Default 1. See `?uwot::umap` for more information.} + +\item{pca}{Logical. Whether to perform +dimensionality reduction with PCA before UMAP.} + +\item{initialDims}{Integer. Number of dimensions from PCA to use as +input in UMAP. Default 50.} + +\item{cores}{Number of threads to use. Default 1.} + +\item{...}{Other parameters to pass to `uwot::umap`.} } \value{ -A two column matrix of umap coordinates +A two column matrix of UMAP coordinates } \description{ Embeds cells in two dimensions using umap based on a `celda_C` diff --git a/man/celdaUmap-celda_CG-method.Rd b/man/celdaUmap-celda_CG-method.Rd index 0c9c399f..bfe4165d 100644 --- a/man/celdaUmap-celda_CG-method.Rd +++ b/man/celdaUmap-celda_CG-method.Rd @@ -1,13 +1,22 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{celdaUmap,celda_CG-method} \alias{celdaUmap,celda_CG-method} \title{umap for celda_CG} \usage{ -\S4method{celdaUmap}{celda_CG}(counts, celdaMod, maxCells = 25000, - minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) +\S4method{celdaUmap}{celda_CG}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + modules = NULL, + seed = 12345, + nNeighbors = 30, + minDist = 0.75, + spread = 1, + cores = 1, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -18,20 +27,40 @@ cells. This matrix should be the same as the one used to generate \item{maxCells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -requires more memory. Default 25000.} +requires more memory. If NULL, no subsampling will be performed. +Default NULL.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} \item{modules}{Integer vector. Determines which features modules to use for -tSNE. If NULL, all modules will be used. Default NULL.} +UMAP. If NULL, all modules will be used. Default NULL.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} -\item{umapConfig}{Object of class `umap.config`. Configures parameters for -umap. Default `umap::umap.defaults`.} +\item{nNeighbors}{The size of local neighborhood used for +manifold approximation. Larger values result in more global +views of the manifold, while smaller values result in more +local data being preserved. Default 30. +See `?uwot::umap` for more information.} + +\item{minDist}{The effective minimum distance between embedded points. +Smaller values will result in a more clustered/clumped +embedding where nearby points on the manifold are drawn +closer together, while larger values will result on a more +even dispersal of points. Default 0.2. +See `?uwot::umap` for more information.} + +\item{spread}{The effective scale of embedded points. In combination with +‘min_dist’, this determines how clustered/clumped the +embedded points are. Default 1. +See `?uwot::umap` for more information.} + +\item{cores}{Number of threads to use. Default 1.} + +\item{...}{Other parameters to pass to `uwot::umap`.} } \value{ A two column matrix of umap coordinates diff --git a/man/celdaUmap-celda_G-method.Rd b/man/celdaUmap-celda_G-method.Rd index 9d772b77..9852ffc6 100644 --- a/man/celdaUmap-celda_G-method.Rd +++ b/man/celdaUmap-celda_G-method.Rd @@ -1,13 +1,22 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{celdaUmap,celda_G-method} \alias{celdaUmap,celda_G-method} \title{umap for celda_G} \usage{ -\S4method{celdaUmap}{celda_G}(counts, celdaMod, maxCells = 25000, - minClusterSize = 100, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) +\S4method{celdaUmap}{celda_G}( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + modules = NULL, + seed = 12345, + nNeighbors = 30, + minDist = 0.2, + spread = 1, + cores = 1, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -18,20 +27,40 @@ cells. This matrix should be the same as the one used to generate \item{maxCells}{Integer. Maximum number of cells to plot. Cells will be randomly subsampled if ncol(counts) > maxCells. Larger numbers of cells -requires more memory. Default 25000.} +requires more memory. If NULL, no subsampling will be performed. +Default NULL.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} \item{modules}{Integer vector. Determines which features modules to use for -tSNE. If NULL, all modules will be used. Default NULL.} +UMAP. If NULL, all modules will be used. Default NULL.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} -\item{umapConfig}{Object of class `umap.config`. Configures parameters for -umap. Default `umap::umap.defaults`.} +\item{nNeighbors}{The size of local neighborhood used for +manifold approximation. Larger values result in more global +views of the manifold, while smaller values result in more +local data being preserved. Default 30. +See `?uwot::umap` for more information.} + +\item{minDist}{The effective minimum distance between embedded points. +Smaller values will result in a more clustered/clumped +embedding where nearby points on the manifold are drawn +closer together, while larger values will result on a more +even dispersal of points. Default 0.2. +See `?uwot::umap` for more information.} + +\item{spread}{The effective scale of embedded points. In combination with +‘min_dist’, this determines how clustered/clumped the +embedded points are. Default 1. +See `?uwot::umap` for more information.} + +\item{cores}{Number of threads to use. Default 1.} + +\item{...}{Other parameters to pass to `uwot::umap`.} } \value{ A two column matrix of umap coordinates diff --git a/man/celdaUmap.Rd b/man/celdaUmap.Rd index af6667b6..d4ff139d 100644 --- a/man/celdaUmap.Rd +++ b/man/celdaUmap.Rd @@ -4,9 +4,15 @@ \alias{celdaUmap} \title{Embeds cells in two dimensions using umap.} \usage{ -celdaUmap(counts, celdaMod, maxCells = 25000, minClusterSize = 100, - initialDims = 20, modules = NULL, seed = 12345, - umapConfig = umap::umap.defaults) +celdaUmap( + counts, + celdaMod, + maxCells = NULL, + minClusterSize = 100, + modules = NULL, + seed = 12345, + ... +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -22,28 +28,21 @@ requires more memory. Default 25000.} \item{minClusterSize}{Integer. Do not subsample cell clusters below this threshold. Default 100.} -\item{initialDims}{Integer. PCA will be used to reduce the dimentionality -of the dataset. The top 'initialDims' principal components will be used -for umap. Default 20.} - \item{modules}{Integer vector. Determines which features modules to use for tSNE. If NULL, all modules will be used. Default NULL.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to -\link[withr]{with_seed} are made.} - -\item{umapConfig}{An object of class "umapConfig" specifying parameters to +\link[withr]{with_seed} are made. the UMAP algorithm.} + +\item{...}{Additional parameters to `uwot::umap`} } \value{ -Numeric Matrix of dimension `ncol(counts)` x 2, colums representing - the "X" and "Y" coordinates in the data's t-SNE represetation. +A two column matrix of UMAP coordinates#' @examples +data(celdaCGSim, celdaCGMod) +umapRes <- celdaUmap(celdaCGSim$counts, celdaCGMod) } \description{ Embeds cells in two dimensions using umap. } -\examples{ -data(celdaCGSim, celdaCGMod) -tsneRes <- celdaUmap(celdaCGSim$counts, celdaCGMod) -} diff --git a/man/celda_C.Rd b/man/celda_C.Rd index f4389270..0c33c93b 100644 --- a/man/celda_C.Rd +++ b/man/celda_C.Rd @@ -4,12 +4,25 @@ \alias{celda_C} \title{Cell clustering with Celda} \usage{ -celda_C(counts, sampleLabel = NULL, K, alpha = 1, beta = 1, - algorithm = c("EM", "Gibbs"), stopIter = 10, maxIter = 200, - splitOnIter = 10, splitOnLast = TRUE, seed = 12345, nchains = 3, +celda_C( + counts, + sampleLabel = NULL, + K, + alpha = 1, + beta = 1, + algorithm = c("EM", "Gibbs"), + stopIter = 10, + maxIter = 200, + splitOnIter = 10, + splitOnLast = TRUE, + seed = 12345, + nchains = 3, zInitialize = c("split", "random", "predefined"), - countChecksum = NULL, zInit = NULL, logfile = NULL, - verbose = TRUE) + countChecksum = NULL, + zInit = NULL, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -75,7 +88,7 @@ can only be used when `initialize = 'random'`. Default NULL.} } \value{ An object of class `celda_C` with the cell population clusters - stored in in `z`. + stored in `z`. } \description{ Clusters the columns of a count matrix containing single-cell diff --git a/man/celda_CG.Rd b/man/celda_CG.Rd index c12e0ecf..c2fc946d 100644 --- a/man/celda_CG.Rd +++ b/man/celda_CG.Rd @@ -4,14 +4,30 @@ \alias{celda_CG} \title{Cell and feature clustering with Celda} \usage{ -celda_CG(counts, sampleLabel = NULL, K, L, alpha = 1, beta = 1, - delta = 1, gamma = 1, algorithm = c("EM", "Gibbs"), - stopIter = 10, maxIter = 200, splitOnIter = 10, - splitOnLast = TRUE, seed = 12345, nchains = 3, +celda_CG( + counts, + sampleLabel = NULL, + K, + L, + alpha = 1, + beta = 1, + delta = 1, + gamma = 1, + algorithm = c("EM", "Gibbs"), + stopIter = 10, + maxIter = 200, + splitOnIter = 10, + splitOnLast = TRUE, + seed = 12345, + nchains = 3, zInitialize = c("split", "random", "predefined"), yInitialize = c("split", "random", "predefined"), - countChecksum = NULL, zInit = NULL, yInit = NULL, logfile = NULL, - verbose = TRUE) + countChecksum = NULL, + zInit = NULL, + yInit = NULL, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent @@ -94,7 +110,7 @@ starting values for each feature will be randomly sampled from 1:L. } \value{ An object of class `celda_CG` with the cell populations clusters - stored in in `z` and feature module clusters stored in `y`. + stored in `z` and feature module clusters stored in `y`. } \description{ Clusters the rows and columns of a count matrix containing diff --git a/man/celda_G.Rd b/man/celda_G.Rd index 520102ca..85dcfa87 100644 --- a/man/celda_G.Rd +++ b/man/celda_G.Rd @@ -4,11 +4,24 @@ \alias{celda_G} \title{Feature clustering with Celda} \usage{ -celda_G(counts, L, beta = 1, delta = 1, gamma = 1, stopIter = 10, - maxIter = 200, splitOnIter = 10, splitOnLast = TRUE, - seed = 12345, nchains = 3, yInitialize = c("split", "random", - "predefined"), countChecksum = NULL, yInit = NULL, logfile = NULL, - verbose = TRUE) +celda_G( + counts, + L, + beta = 1, + delta = 1, + gamma = 1, + stopIter = 10, + maxIter = 200, + splitOnIter = 10, + splitOnLast = TRUE, + seed = 12345, + nchains = 3, + yInitialize = c("split", "random", "predefined"), + countChecksum = NULL, + yInit = NULL, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/clusterProbability-celda_C-method.Rd b/man/clusterProbability-celda_C-method.Rd index 16dbf199..16a165c6 100644 --- a/man/clusterProbability-celda_C-method.Rd +++ b/man/clusterProbability-celda_C-method.Rd @@ -1,13 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{clusterProbability,celda_C-method} \alias{clusterProbability,celda_C-method} \title{Conditional probabilities for cells in subpopulations from a Celda_C model} \usage{ -\S4method{clusterProbability}{celda_C}(counts, celdaMod, log = FALSE, - ...) +\S4method{clusterProbability}{celda_C}(counts, celdaMod, log = FALSE, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/clusterProbability-celda_CG-method.Rd b/man/clusterProbability-celda_CG-method.Rd index 98a80896..c5828a7a 100644 --- a/man/clusterProbability-celda_CG-method.Rd +++ b/man/clusterProbability-celda_CG-method.Rd @@ -1,13 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{clusterProbability,celda_CG-method} \alias{clusterProbability,celda_CG-method} \title{Conditional probabilities for cells and features from a Celda_CG model} \usage{ -\S4method{clusterProbability}{celda_CG}(counts, celdaMod, log = FALSE, - ...) +\S4method{clusterProbability}{celda_CG}(counts, celdaMod, log = FALSE, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/clusterProbability-celda_G-method.Rd b/man/clusterProbability-celda_G-method.Rd index 8eb2c673..54826b5e 100644 --- a/man/clusterProbability-celda_G-method.Rd +++ b/man/clusterProbability-celda_G-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{clusterProbability,celda_G-method} \alias{clusterProbability,celda_G-method} \title{Conditional probabilities for features in modules from a Celda_G model} \usage{ -\S4method{clusterProbability}{celda_G}(counts, celdaMod, log = FALSE, - ...) +\S4method{clusterProbability}{celda_G}(counts, celdaMod, log = FALSE, ...) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/clusters-celdaModel-method.Rd b/man/clusters-celdaModel-method.Rd index c8110f06..fcfdde8e 100644 --- a/man/clusters-celdaModel-method.Rd +++ b/man/clusters-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{clusters,celdaModel-method} \alias{clusters,celdaModel-method} \title{Get clustering outcomes from a celdaModel} diff --git a/man/countChecksum-celdaList-method.Rd b/man/countChecksum-celdaList-method.Rd index b6f683a8..8fcb23d8 100644 --- a/man/countChecksum-celdaList-method.Rd +++ b/man/countChecksum-celdaList-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{countChecksum,celdaList-method} \alias{countChecksum,celdaList-method} \title{Get the MD5 hash of the count matrix from the celdaList} diff --git a/man/decontX.Rd b/man/decontX.Rd index d0222870..3a5f3bad 100644 --- a/man/decontX.Rd +++ b/man/decontX.Rd @@ -2,50 +2,130 @@ % Please edit documentation in R/decon.R \name{decontX} \alias{decontX} -\title{Decontaminate count matrix} +\alias{decontX,SingleCellExperiment-method} +\alias{decontX,ANY-method} +\title{Contamination estimation with decontX} \usage{ -decontX(counts, z = NULL, batch = NULL, maxIter = 200, delta = 10, - logfile = NULL, verbose = TRUE, seed = 12345) +\S4method{decontX}{SingleCellExperiment}( + x, + assayName = "counts", + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE +) + +\S4method{decontX}{ANY}( + x, + z = NULL, + batch = NULL, + maxIter = 500, + delta = 10, + convergence = 0.001, + iterLogLik = 10, + varGenes = 5000, + dbscanEps = 1, + L = 50, + seed = 12345, + logfile = NULL, + verbose = TRUE +) } \arguments{ -\item{counts}{Numeric/Integer matrix. Observed count matrix, rows represent -features and columns represent cells.} +\item{x}{A numeric matrix of counts or a \linkS4class{SingleCellExperiment} +with the matrix located in the assay slot under \code{assayName}. +Cells in each batch will be subsetted and converted to a sparse matrix +of class \code{dgCMatrix} from package \link{Matrix} before analysis.} -\item{z}{Integer vector. Cell population labels. Default NULL.} +\item{assayName}{Character. Name of the assay to use if \code{x} is a +\linkS4class{SingleCellExperiment}.} -\item{batch}{Integer vector. Cell batch labels. Default NULL.} +\item{z}{Numeric or character vector. Cell cluster labels. If NULL, +Celda will be used to reduce the dimensionality of the dataset +to 'L' modules, '\link[uwot]{umap}' from the 'uwot' package +will be used to further reduce the dataset to 2 dimenions and +the '\link[dbscan]{dbscan}' function from the 'dbscan' package +will be used to identify clusters of broad cell types. Default NULL.} -\item{maxIter}{Integer. Maximum iterations of EM algorithm. Default to be -200.} +\item{batch}{Numeric or character vector. Batch labels for cells. +If batch labels are supplied, DecontX is run on cells from each +batch separately. Cells run in different channels or assays +should be considered different batches. Default NULL.} -\item{delta}{Numeric. Symmetric concentration parameter for Theta. Default -to be 10.} +\item{maxIter}{Integer. Maximum iterations of the EM algorithm. Default 500.} -\item{logfile}{Character. Messages will be redirected to a file named -`logfile`. If NULL, messages will be printed to stdout. Default NULL.} +\item{delta}{Numeric. Symmetric Dirichlet concentration parameter +to initialize theta. Default 10.} -\item{verbose}{Logical. Whether to print log messages. Default TRUE.} +\item{convergence}{Numeric. The EM algorithm will be stopped if the maximum +difference in the contamination estimates between the previous and +current iterations is less than this. Default 0.001.} + +\item{iterLogLik}{Integer. Calculate log likelihood every 'iterLogLik' +iteration. Default 10.} + +\item{varGenes}{Integer. The number of variable genes to use in +Celda clustering. Variability is calcualted using +\code{\link[scran]{modelGeneVar}} function from the 'scran' package. +Used only when z is not provided. Default 5000.} + +\item{dbscanEps}{Numeric. The clustering resolution parameter +used in '\link[dbscan]{dbscan}' to estimate broad cell clusters. +Used only when z is not provided. Default 1.} + +\item{L}{Integer. Number of modules for Celda clustering. Used to reduce +the dimensionality of the dataset before applying UMAP and dbscan. +Used only when z is not provided. Default 50.} \item{seed}{Integer. Passed to \link[withr]{with_seed}. For reproducibility, a default value of 12345 is used. If NULL, no calls to \link[withr]{with_seed} are made.} + +\item{logfile}{Character. Messages will be redirected to a file named +`logfile`. If NULL, messages will be printed to stdout. Default NULL.} + +\item{verbose}{Logical. Whether to print log messages. Default TRUE.} } \value{ -A list object which contains the decontaminated count matrix and - related parameters. +If \code{x} is a matrix-like object, a list will be returned +with the following items: +\describe{ +\item{\code{decontXcounts}:}{The decontaminated matrix. Values obtained +from the variational inference procedure may be non-integer. However, +integer counts can be obtained by rounding, +e.g. \code{round(decontXcounts)}.} +\item{\code{contamination}:}{Percentage of contamination in each cell.} +\item{\code{estimates}:}{List of estimated parameters for each batch. If z +was not supplied, then the UMAP coordinates used to generated cell +cluster labels will also be stored here.} +\item{\code{z}:}{Cell population/cluster labels used for analysis.} +\item{\code{runParams}:}{List of arguments used in the function call.} +} + +If \code{x} is a \linkS4class{SingleCellExperiment}, then the decontaminated +counts will be stored as an assay and can be accessed with +\code{decontXcounts(x)}. The contamination values and cluster labels +will be stored in \code{colData(x)}. \code{estimates} and \code{runParams} +will be stored in \code{metadata(x)$decontX}. If z was not supplied, then +the UMAPs used to generated cell cluster labels will be stored in +\code{reducedDims} slot in \code{x} } \description{ -This function updates decontamination on dataset with multiple - batches. +Identifies contamination from factors such as ambient RNA +in single cell genomic datasets. } \examples{ -data(contaminationSim) -deconC <- decontX( - counts = contaminationSim$rmat + contaminationSim$cmat, - z = contaminationSim$z, maxIter = 3 -) -deconBg <- decontX( - counts = contaminationSim$rmat + contaminationSim$cmat, - maxIter = 3 -) +s <- simulateContaminatedMatrix() +result <- decontX(s$observedCounts, s$z) +contamination <- colSums(s$observedCounts - s$nativeCounts) / + colSums(s$observedCounts) +plot(contamination, result$contamination) } diff --git a/man/differentialExpression.Rd b/man/differentialExpression.Rd index 698a56b4..3fdf5102 100644 --- a/man/differentialExpression.Rd +++ b/man/differentialExpression.Rd @@ -4,8 +4,15 @@ \alias{differentialExpression} \title{Differential expression for cell subpopulations using MAST} \usage{ -differentialExpression(counts, celdaMod, c1, c2 = NULL, - onlyPos = FALSE, log2fcThreshold = NULL, fdrThreshold = 1) +differentialExpression( + counts, + celdaMod, + c1, + c2 = NULL, + onlyPos = FALSE, + log2fcThreshold = NULL, + fdrThreshold = 1 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/distinctColors.Rd b/man/distinctColors.Rd index 4fc38644..13cf5b60 100644 --- a/man/distinctColors.Rd +++ b/man/distinctColors.Rd @@ -4,9 +4,12 @@ \alias{distinctColors} \title{Create a color palette} \usage{ -distinctColors(n, hues = c("red", "cyan", "orange", "blue", "yellow", - "purple", "green", "magenta"), saturationRange = c(0.7, 1), - valueRange = c(0.7, 1)) +distinctColors( + n, + hues = c("red", "cyan", "orange", "blue", "yellow", "purple", "green", "magenta"), + saturationRange = c(0.7, 1), + valueRange = c(0.7, 1) +) } \arguments{ \item{n}{Integer. Number of colors to generate.} diff --git a/man/factorizeMatrix-celda_C-method.Rd b/man/factorizeMatrix-celda_C-method.Rd index d2a29ef9..244b90f8 100644 --- a/man/factorizeMatrix-celda_C-method.Rd +++ b/man/factorizeMatrix-celda_C-method.Rd @@ -1,12 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{factorizeMatrix,celda_C-method} \alias{factorizeMatrix,celda_C-method} \title{Matrix factorization for results from celda_C()} \usage{ -\S4method{factorizeMatrix}{celda_C}(counts, celdaMod, type = c("counts", - "proportion", "posterior")) +\S4method{factorizeMatrix}{celda_C}( + counts, + celdaMod, + type = c("counts", "proportion", "posterior") +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/factorizeMatrix-celda_CG-method.Rd b/man/factorizeMatrix-celda_CG-method.Rd index eba62840..8543d614 100644 --- a/man/factorizeMatrix-celda_CG-method.Rd +++ b/man/factorizeMatrix-celda_CG-method.Rd @@ -1,12 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{factorizeMatrix,celda_CG-method} \alias{factorizeMatrix,celda_CG-method} \title{Matrix factorization for results from celda_CG} \usage{ -\S4method{factorizeMatrix}{celda_CG}(counts, celdaMod, type = c("counts", - "proportion", "posterior")) +\S4method{factorizeMatrix}{celda_CG}( + counts, + celdaMod, + type = c("counts", "proportion", "posterior") +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/factorizeMatrix-celda_G-method.Rd b/man/factorizeMatrix-celda_G-method.Rd index 9b5d2dad..7dbdd9de 100644 --- a/man/factorizeMatrix-celda_G-method.Rd +++ b/man/factorizeMatrix-celda_G-method.Rd @@ -1,12 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{factorizeMatrix,celda_G-method} \alias{factorizeMatrix,celda_G-method} \title{Matrix factorization for results from celda_G} \usage{ -\S4method{factorizeMatrix}{celda_G}(counts, celdaMod, type = c("counts", - "proportion", "posterior")) +\S4method{factorizeMatrix}{celda_G}( + counts, + celdaMod, + type = c("counts", "proportion", "posterior") +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/factorizeMatrix.Rd b/man/factorizeMatrix.Rd index e906ee71..35487343 100644 --- a/man/factorizeMatrix.Rd +++ b/man/factorizeMatrix.Rd @@ -5,8 +5,11 @@ \title{Generate factorized matrices showing each feature's influence on cell / gene clustering} \usage{ -factorizeMatrix(counts, celdaMod, type = c("counts", "proportion", - "posterior")) +factorizeMatrix( + counts, + celdaMod, + type = c("counts", "proportion", "posterior") +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/featureModuleLookup-celda_C-method.Rd b/man/featureModuleLookup-celda_C-method.Rd index 07eeb520..2d15b0cf 100644 --- a/man/featureModuleLookup-celda_C-method.Rd +++ b/man/featureModuleLookup-celda_C-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{featureModuleLookup,celda_C-method} \alias{featureModuleLookup,celda_C-method} \title{Lookup the module of a feature} \usage{ -\S4method{featureModuleLookup}{celda_C}(counts, celdaMod, feature, - exactMatch = TRUE) +\S4method{featureModuleLookup}{celda_C}(counts, celdaMod, feature, exactMatch = TRUE) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/featureModuleLookup-celda_CG-method.Rd b/man/featureModuleLookup-celda_CG-method.Rd index d16c9a3f..7404cc03 100644 --- a/man/featureModuleLookup-celda_CG-method.Rd +++ b/man/featureModuleLookup-celda_CG-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{featureModuleLookup,celda_CG-method} \alias{featureModuleLookup,celda_CG-method} \title{Lookup the module of a feature} \usage{ -\S4method{featureModuleLookup}{celda_CG}(counts, celdaMod, feature, - exactMatch = TRUE) +\S4method{featureModuleLookup}{celda_CG}(counts, celdaMod, feature, exactMatch = TRUE) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/featureModuleLookup-celda_G-method.Rd b/man/featureModuleLookup-celda_G-method.Rd index b4b5274d..b2481c08 100644 --- a/man/featureModuleLookup-celda_G-method.Rd +++ b/man/featureModuleLookup-celda_G-method.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{featureModuleLookup,celda_G-method} \alias{featureModuleLookup,celda_G-method} \title{Lookup the module of a feature} \usage{ -\S4method{featureModuleLookup}{celda_G}(counts, celdaMod, feature, - exactMatch = TRUE) +\S4method{featureModuleLookup}{celda_G}(counts, celdaMod, feature, exactMatch = TRUE) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/buildTreeHybrid.Rd b/man/findMarkers.Rd similarity index 83% rename from man/buildTreeHybrid.Rd rename to man/findMarkers.Rd index 6c9528d2..403315ce 100644 --- a/man/buildTreeHybrid.Rd +++ b/man/findMarkers.Rd @@ -1,18 +1,28 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildTreeHybrid.R -\name{buildTreeHybrid} -\alias{buildTreeHybrid} +% Please edit documentation in R/findMarkers.R +\name{findMarkers} +\alias{findMarkers} \title{Generate decision tree from single-cell clustering output.} \usage{ -buildTreeHybrid(features, class, oneoffMetric = c("modified F1", - "pairwise AUC"), threshold = 0.95, reuseFeatures = FALSE, - altSplit = TRUE, consecutiveOneoff = TRUE) +findMarkers( + features, + class, + cellTypes, + oneoffMetric = c("modified F1", "pairwise AUC"), + threshold = 0.95, + reuseFeatures = FALSE, + altSplit = TRUE, + consecutiveOneoff = TRUE +) } \arguments{ \item{features}{A L(features) by N(samples) numeric matrix.} \item{class}{A vector of K label assignemnts.} +\item{cellTypes}{List where each element is a cell type and all the clusters +within that cell type (i.e. subtypes).} + \item{oneoffMetric}{A character string. What one-off metric to run, either `modified F1` or `pairwise AUC`.} @@ -83,20 +93,20 @@ Uses decision tree procudure to generate a set of rules for each \examples{ library(M3DExampleData) counts <- M3DExampleData::Mmus_example_list$data -# Subset 500 genes for fast clustering -counts <- as.matrix(counts[1501:2000, ]) -# Cluster genes ans samples each into 10 modules +# subset 100 genes for fast clustering +counts <- as.matrix(counts[1500:2000, ]) +# cluster genes into 10 modules for quick demo cm <- celda_CG(counts = counts, L = 10, K = 5, verbose = FALSE) # Get features matrix and cluster assignments factorized <- factorizeMatrix(counts, cm) features <- factorized$proportions$cell class <- clusters(cm)$z # Generate Decision Tree -DecTree <- buildTreeHybrid(features, - class, - oneoffMetric = "modified F1", - threshold = 1, - consecutiveOneoff = FALSE) +DecTree <- findMarkers(features, + class, + oneoffMetric = "modified F1", + threshold = 1, + consecutiveOneoff = FALSE) # Plot dendrogram plotDendro(DecTree) diff --git a/man/getDecisions.Rd b/man/getDecisions.Rd index f4289348..b9577534 100644 --- a/man/getDecisions.Rd +++ b/man/getDecisions.Rd @@ -3,12 +3,12 @@ \name{getDecisions} \alias{getDecisions} \title{Gets cluster estimates using rules generated by - `celda::buildTreeHybrid`} + `celda::findMarkers`} \usage{ getDecisions(rules, features) } \arguments{ -\item{rules}{List object. The `rules` element from `buildTreeHybrid` +\item{rules}{List object. The `rules` element from `findMarkers` output. Returns NA if cluster estimation was ambiguous.} \item{features}{A L(features) by N(samples) numeric matrix.} @@ -32,7 +32,7 @@ factorized <- factorizeMatrix(counts, cm) features <- factorized$proportions$cell class <- clusters(cm)$z # Generate Decision Tree -DecTree <- buildTreeHybrid(features, +DecTree <- findMarkers(features, class, oneoffMetric = "modified F1", threshold = 1, diff --git a/man/logLikelihoodHistory-celdaModel-method.Rd b/man/logLikelihoodHistory-celdaModel-method.Rd index 06ceffbf..ce378406 100644 --- a/man/logLikelihoodHistory-celdaModel-method.Rd +++ b/man/logLikelihoodHistory-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{logLikelihoodHistory,celdaModel-method} \alias{logLikelihoodHistory,celdaModel-method} \title{Get log-likelihood history} diff --git a/man/logLikelihoodcelda_CG.Rd b/man/logLikelihoodcelda_CG.Rd index 7290168e..e99dff79 100644 --- a/man/logLikelihoodcelda_CG.Rd +++ b/man/logLikelihoodcelda_CG.Rd @@ -4,8 +4,18 @@ \alias{logLikelihoodcelda_CG} \title{Calculate Celda_CG log likelihood} \usage{ -logLikelihoodcelda_CG(counts, sampleLabel, z, y, K, L, alpha, beta, delta, - gamma) +logLikelihoodcelda_CG( + counts, + sampleLabel, + z, + y, + K, + L, + alpha, + beta, + delta, + gamma +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/matrixNames-celdaModel-method.Rd b/man/matrixNames-celdaModel-method.Rd index 9c8556a7..fcfbb563 100644 --- a/man/matrixNames-celdaModel-method.Rd +++ b/man/matrixNames-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{matrixNames,celdaModel-method} \alias{matrixNames,celdaModel-method} \title{Get feature, cell and sample names from a celdaModel} diff --git a/man/moduleHeatmap.Rd b/man/moduleHeatmap.Rd index fab5aa9b..6307d967 100644 --- a/man/moduleHeatmap.Rd +++ b/man/moduleHeatmap.Rd @@ -4,9 +4,16 @@ \alias{moduleHeatmap} \title{Heatmap for featureModules} \usage{ -moduleHeatmap(counts, celdaMod, featureModule = 1, topCells = 100, - topFeatures = NULL, normalizedCounts = NA, scaleRow = scale, - showFeaturenames = TRUE) +moduleHeatmap( + counts, + celdaMod, + featureModule = 1, + topCells = 100, + topFeatures = NULL, + normalizedCounts = NA, + scaleRow = scale, + showFeaturenames = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/normalizeCounts.Rd b/man/normalizeCounts.Rd index bb63644d..3bc33b8a 100644 --- a/man/normalizeCounts.Rd +++ b/man/normalizeCounts.Rd @@ -4,9 +4,14 @@ \alias{normalizeCounts} \title{Normalization of count data} \usage{ -normalizeCounts(counts, normalize = c("proportion", "cpm", "median", - "mean"), transformationFun = NULL, scaleFun = NULL, - pseudocountNormalize = 0, pseudocountTransform = 0) +normalizeCounts( + counts, + normalize = c("proportion", "cpm", "median", "mean"), + transformationFun = NULL, + scaleFun = NULL, + pseudocountNormalize = 0, + pseudocountTransform = 0 +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/params-celdaModel-method.Rd b/man/params-celdaModel-method.Rd index 740dfa0f..7715228a 100644 --- a/man/params-celdaModel-method.Rd +++ b/man/params-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{params,celdaModel-method} \alias{params,celdaModel-method} \title{Get parameter values provided for celdaModel creation} diff --git a/man/perplexity-celda_C-method.Rd b/man/perplexity-celda_C-method.Rd index 79cd82c8..18b60daa 100644 --- a/man/perplexity-celda_C-method.Rd +++ b/man/perplexity-celda_C-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_C.R -\docType{methods} \name{perplexity,celda_C-method} \alias{perplexity,celda_C-method} \title{Calculate the perplexity on new data with a celda_C model} diff --git a/man/perplexity-celda_CG-method.Rd b/man/perplexity-celda_CG-method.Rd index d6861632..89d1e9e5 100644 --- a/man/perplexity-celda_CG-method.Rd +++ b/man/perplexity-celda_CG-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_CG.R -\docType{methods} \name{perplexity,celda_CG-method} \alias{perplexity,celda_CG-method} \title{Calculate the perplexity on new data with a celda_CG model} diff --git a/man/perplexity-celda_G-method.Rd b/man/perplexity-celda_G-method.Rd index ffbc5967..7db11951 100644 --- a/man/perplexity-celda_G-method.Rd +++ b/man/perplexity-celda_G-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/celda_G.R -\docType{methods} \name{perplexity,celda_G-method} \alias{perplexity,celda_G-method} \title{Calculate the perplexity on new data with a celda_G model} diff --git a/man/plotDendro.Rd b/man/plotDendro.Rd index bf01daba..d9106f55 100644 --- a/man/plotDendro.Rd +++ b/man/plotDendro.Rd @@ -2,19 +2,29 @@ % Please edit documentation in R/plotDendro.R \name{plotDendro} \alias{plotDendro} -\title{Plots dendrogram of `buildTreeHybrid` output} +\title{Plots dendrogram of `findMarkers` output} \usage{ -plotDendro(decisionTree, classLabel = NULL, addSensPrec = TRUE, - leafSize = 24, boxSize = 7, boxColor = "black") +plotDendro( + decisionTree, + classLabel = NULL, + addSensPrec = FALSE, + maxFeaturePrint = 4, + leafSize = 24, + boxSize = 7, + boxColor = "black" +) } \arguments{ -\item{decisionTree}{List object. The output of `celda::buildTreeHybrid`.} +\item{decisionTree}{List object. The output of `celda::findMarkers`.} \item{classLabel}{A character value. The name of a label to which draw the path and rules. If NULL (default), the rules for every cluster is shown.} \item{addSensPrec}{Logical. Print training sensitivities and precisions -for each cluster below leaf label? Default is TRUE.} +for each cluster below leaf label? Default is FALSE.} + +\item{maxFeaturePrint}{A numeric value. Maximum number of feature IDs to +print at a given node. Default is 4.} \item{leafSize}{A numeric value. Size of text below each leaf. Default is 24.} @@ -27,7 +37,7 @@ A ggplot2 object } \description{ Generates a dendrogram of the rules and performance -(optional) of the decision tree generates by `buildTreeHybrid`. +(optional) of the decision tree generates by `findMarkers`. } \examples{ library(M3DExampleData) @@ -41,7 +51,7 @@ factorized <- factorizeMatrix(counts, cm) features <- factorized$proportions$cell class <- clusters(cm)$z # Generate Decision Tree -decTree <- buildTreeHybrid(features, +decTree <- findMarkers(features, class, oneoffMetric = "modified F1", threshold = 1, diff --git a/man/plotDimReduceCluster.Rd b/man/plotDimReduceCluster.Rd index d9756f53..e8bd96ee 100644 --- a/man/plotDimReduceCluster.Rd +++ b/man/plotDimReduceCluster.Rd @@ -4,9 +4,18 @@ \alias{plotDimReduceCluster} \title{Plotting the cell labels on a dimensionality reduction plot} \usage{ -plotDimReduceCluster(dim1, dim2, cluster, size = 1, - xlab = "Dimension_1", ylab = "Dimension_2", - specificClusters = NULL, labelClusters = FALSE, labelSize = 3.5) +plotDimReduceCluster( + dim1, + dim2, + cluster, + size = 1, + xlab = "Dimension_1", + ylab = "Dimension_2", + specificClusters = NULL, + labelClusters = FALSE, + groupBy = NULL, + labelSize = 3.5 +) } \arguments{ \item{dim1}{Numeric vector. First dimension from data @@ -31,6 +40,9 @@ If NULL, all clusters will be colored. Default NULL.} \item{labelClusters}{Logical. Whether the cluster labels are plotted. Default FALSE.} +\item{groupBy}{Character vector. Contains sample labels for each cell. +If NULL, all samples will be plotted together. Default NULL.} + \item{labelSize}{Numeric. Sets size of label if labelClusters is TRUE. Default 3.5.} } diff --git a/man/plotDimReduceFeature.Rd b/man/plotDimReduceFeature.Rd index 37fe4f76..812e7733 100644 --- a/man/plotDimReduceFeature.Rd +++ b/man/plotDimReduceFeature.Rd @@ -4,10 +4,23 @@ \alias{plotDimReduceFeature} \title{Plotting feature expression on a dimensionality reduction plot} \usage{ -plotDimReduceFeature(dim1, dim2, counts, features, normalize = TRUE, - exactMatch = TRUE, trim = c(-2, 2), size = 1, - xlab = "Dimension_1", ylab = "Dimension_2", colorLow = "grey", - colorMid = NULL, colorHigh = "blue") +plotDimReduceFeature( + dim1, + dim2, + counts, + features, + headers = NULL, + normalize = TRUE, + exactMatch = TRUE, + trim = c(-2, 2), + size = 1, + xlab = "Dimension_1", + ylab = "Dimension_2", + colorLow = "blue", + colorMid = "white", + colorHigh = "red", + ncol = NULL +) } \arguments{ \item{dim1}{Numeric vector. First dimension from data @@ -21,6 +34,9 @@ represent cells.} \item{features}{Character vector. Uses these genes for plotting.} +\item{headers}{Character vector. If `NULL`, the corresponding rownames are +used as labels. Otherwise, these headers are used to label the genes.} + \item{normalize}{Logical. Whether to normalize the columns of `counts`. Default TRUE.} @@ -39,13 +55,16 @@ Set to NULL to disable. Default c(-2,2).} \item{ylab}{Character vector. Label for the y-axis. Default "Dimension_2".} \item{colorLow}{Character. A color available from `colors()`. The color -will be used to signify the lowest values on the scale. Default 'grey'.} +will be used to signify the lowest values on the scale. Default 'blue'.} \item{colorMid}{Character. A color available from `colors()`. The color -will be used to signify the midpoint on the scale.} +will be used to signify the midpoint on the scale. Default 'white'.} \item{colorHigh}{Character. A color available from `colors()`. The color -will be used to signify the highest values on the scale. Default 'blue'.} +will be used to signify the highest values on the scale. Default 'red'.} + +\item{ncol}{Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +number of columns for facet wrap.} } \value{ The plot as a ggplot object diff --git a/man/plotDimReduceGrid.Rd b/man/plotDimReduceGrid.Rd index 164544c6..e6afc0b6 100644 --- a/man/plotDimReduceGrid.Rd +++ b/man/plotDimReduceGrid.Rd @@ -4,8 +4,20 @@ \alias{plotDimReduceGrid} \title{Mapping the dimensionality reduction plot} \usage{ -plotDimReduceGrid(dim1, dim2, matrix, size, xlab, ylab, colorLow, colorMid, - colorHigh, varLabel) +plotDimReduceGrid( + dim1, + dim2, + matrix, + size, + xlab, + ylab, + colorLow, + colorMid, + colorHigh, + varLabel, + ncol = NULL, + headers = NULL +) } \arguments{ \item{dim1}{Numeric vector. First dimension from data dimensionality @@ -32,9 +44,15 @@ The color will be used to signify the midpoint on the scale.} \item{colorHigh}{Character. A color available from `colors()`. The color will be used to signify the highest values on the scale. - Default 'blue'.} +Default 'blue'.} \item{varLabel}{Character vector. Title for the color legend.} + +\item{ncol}{Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +number of columns for facet wrap.} + +\item{headers}{Character vector. If `NULL`, the corresponding rownames are +used as labels. Otherwise, these headers are used to label the genes.} } \value{ The plot as a ggplot object diff --git a/man/plotDimReduceModule.Rd b/man/plotDimReduceModule.Rd index f1082752..1fc90c71 100644 --- a/man/plotDimReduceModule.Rd +++ b/man/plotDimReduceModule.Rd @@ -5,10 +5,21 @@ \title{Plotting the Celda module probability on a dimensionality reduction plot} \usage{ -plotDimReduceModule(dim1, dim2, counts, celdaMod, modules = NULL, - rescale = TRUE, size = 1, xlab = "Dimension_1", - ylab = "Dimension_2", colorLow = "grey", colorMid = NULL, - colorHigh = "blue") +plotDimReduceModule( + dim1, + dim2, + counts, + celdaMod, + modules = NULL, + rescale = TRUE, + size = 1, + xlab = "Dimension_1", + ylab = "Dimension_2", + colorLow = "grey", + colorMid = NULL, + colorHigh = "blue", + ncol = NULL +) } \arguments{ \item{dim1}{Numeric vector. @@ -45,6 +56,9 @@ The color will be used to signify the midpoint on the scale.} \item{colorHigh}{Character. A color available from `colors()`. The color will be used to signify the highest values on the scale. Default 'blue'.} + +\item{ncol}{Integer. Passed to \link[ggplot2]{facet_wrap}. Specify the +number of columns for facet wrap.} } \value{ The plot as a ggplot object diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index 9cdefd74..9dbac3a2 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -4,18 +4,38 @@ \alias{plotHeatmap} \title{Plots heatmap based on Celda model} \usage{ -plotHeatmap(counts, z = NULL, y = NULL, scaleRow = scale, - trim = c(-2, 2), featureIx = NULL, cellIx = NULL, - clusterFeature = TRUE, clusterCell = TRUE, +plotHeatmap( + counts, + z = NULL, + y = NULL, + rowGroupOrder = NULL, + colGroupOrder = NULL, + scaleRow = scale, + trim = c(-2, 2), + featureIx = NULL, + cellIx = NULL, + clusterFeature = TRUE, + clusterCell = TRUE, colorScheme = c("divergent", "sequential"), - colorSchemeSymmetric = TRUE, colorSchemeCenter = 0, col = NULL, - annotationCell = NULL, annotationFeature = NULL, - annotationColor = NULL, breaks = NULL, legend = TRUE, - annotationLegend = TRUE, annotationNamesFeature = TRUE, - annotationNamesCell = TRUE, showNamesFeature = FALSE, - showNamesCell = FALSE, hclustMethod = "ward.D2", + colorSchemeSymmetric = TRUE, + colorSchemeCenter = 0, + col = NULL, + annotationCell = NULL, + annotationFeature = NULL, + annotationColor = NULL, + breaks = NULL, + legend = TRUE, + annotationLegend = TRUE, + annotationNamesFeature = TRUE, + annotationNamesCell = TRUE, + showNamesFeature = FALSE, + showNamesCell = FALSE, + hclustMethod = "ward.D2", treeheightFeature = ifelse(clusterFeature, 50, 0), - treeheightCell = ifelse(clusterCell, 50, 0), silent = FALSE, ...) + treeheightCell = ifelse(clusterCell, 50, 0), + silent = FALSE, + ... +) } \arguments{ \item{counts}{Numeric matrix. Normalized counts matrix where rows represent diff --git a/man/recursiveSplitCell.Rd b/man/recursiveSplitCell.Rd index ad215cd6..1d8acf15 100644 --- a/man/recursiveSplitCell.Rd +++ b/man/recursiveSplitCell.Rd @@ -4,10 +4,23 @@ \alias{recursiveSplitCell} \title{Recursive cell splitting} \usage{ -recursiveSplitCell(counts, sampleLabel = NULL, initialK = 5, - maxK = 25, tempL = NULL, yInit = NULL, alpha = 1, beta = 1, - delta = 1, gamma = 1, minCell = 3, reorder = TRUE, - perplexity = TRUE, logfile = NULL, verbose = TRUE) +recursiveSplitCell( + counts, + sampleLabel = NULL, + initialK = 5, + maxK = 25, + tempL = NULL, + yInit = NULL, + alpha = 1, + beta = 1, + delta = 1, + gamma = 1, + minCell = 3, + reorder = TRUE, + perplexity = TRUE, + logfile = NULL, + verbose = TRUE +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/recursiveSplitModule.Rd b/man/recursiveSplitModule.Rd index b9943642..c397d8b9 100644 --- a/man/recursiveSplitModule.Rd +++ b/man/recursiveSplitModule.Rd @@ -4,10 +4,23 @@ \alias{recursiveSplitModule} \title{Recursive module splitting} \usage{ -recursiveSplitModule(counts, initialL = 10, maxL = 100, tempK = 100, - zInit = NULL, sampleLabel = NULL, alpha = 1, beta = 1, - delta = 1, gamma = 1, minFeature = 3, reorder = TRUE, - perplexity = TRUE, verbose = TRUE, logfile = NULL) +recursiveSplitModule( + counts, + initialL = 10, + maxL = 100, + tempK = 100, + zInit = NULL, + sampleLabel = NULL, + alpha = 1, + beta = 1, + delta = 1, + gamma = 1, + minFeature = 3, + reorder = TRUE, + perplexity = TRUE, + verbose = TRUE, + logfile = NULL +) } \arguments{ \item{counts}{Integer matrix. Rows represent features and columns represent diff --git a/man/resList-celdaList-method.Rd b/man/resList-celdaList-method.Rd index 5b622841..3b644935 100644 --- a/man/resList-celdaList-method.Rd +++ b/man/resList-celdaList-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{resList,celdaList-method} \alias{resList,celdaList-method} \title{Get final celdaModels from a celdaList} diff --git a/man/runParams-celdaList-method.Rd b/man/runParams-celdaList-method.Rd index 29228807..8b4c84f8 100644 --- a/man/runParams-celdaList-method.Rd +++ b/man/runParams-celdaList-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{runParams,celdaList-method} \alias{runParams,celdaList-method} \title{Get run parameters provided to `celdaGridSearch()`} diff --git a/man/sampleLabel-celdaModel-method.Rd b/man/sampleLabel-celdaModel-method.Rd index 049266dc..1e39a984 100644 --- a/man/sampleLabel-celdaModel-method.Rd +++ b/man/sampleLabel-celdaModel-method.Rd @@ -1,6 +1,5 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/all_generics.R -\docType{methods} \name{sampleLabel,celdaModel-method} \alias{sampleLabel,celdaModel-method} \title{Get sampleLabels from a celdaModel} diff --git a/man/semiPheatmap.Rd b/man/semiPheatmap.Rd index fae1d21e..99d2baed 100644 --- a/man/semiPheatmap.Rd +++ b/man/semiPheatmap.Rd @@ -4,26 +4,60 @@ \alias{semiPheatmap} \title{A function to draw clustered heatmaps.} \usage{ -semiPheatmap(mat, color = colorRampPalette(rev(brewer.pal(n = 7, name = - "RdYlBu")))(100), kmeansK = NA, breaks = NA, - borderColor = "grey60", cellWidth = NA, cellHeight = NA, - scale = "none", clusterRows = TRUE, clusterCols = TRUE, +semiPheatmap( + mat, + color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100), + kmeansK = NA, + breaks = NA, + borderColor = "grey60", + cellWidth = NA, + cellHeight = NA, + scale = "none", + clusterRows = TRUE, + clusterCols = TRUE, clusteringDistanceRows = "euclidean", - clusteringDistanceCols = "euclidean", clusteringMethod = "complete", - clusteringCallback = .identity2, cutreeRows = NA, cutreeCols = NA, + clusteringDistanceCols = "euclidean", + clusteringMethod = "complete", + clusteringCallback = .identity2, + cutreeRows = NA, + cutreeCols = NA, treeHeightRow = ifelse(clusterRows, 50, 0), - treeHeightCol = ifelse(clusterCols, 50, 0), legend = TRUE, - legendBreaks = NA, legendLabels = NA, annotationRow = NA, - annotationCol = NA, annotation = NA, annotationColors = NA, - annotationLegend = TRUE, annotationNamesRow = TRUE, - annotationNamesCol = TRUE, dropLevels = TRUE, showRownames = TRUE, - showColnames = TRUE, main = NA, fontSize = 10, - fontSizeRow = fontSize, fontSizeCol = fontSize, - displayNumbers = FALSE, numberFormat = "\%.2f", - numberColor = "grey30", fontSizeNumber = 0.8 * fontSize, - gapsRow = NULL, gapsCol = NULL, labelsRow = NULL, - labelsCol = NULL, fileName = NA, width = NA, height = NA, - silent = FALSE, rowLabel, colLabel, ...) + treeHeightCol = ifelse(clusterCols, 50, 0), + legend = TRUE, + legendBreaks = NA, + legendLabels = NA, + annotationRow = NA, + annotationCol = NA, + annotation = NA, + annotationColors = NA, + annotationLegend = TRUE, + annotationNamesRow = TRUE, + annotationNamesCol = TRUE, + dropLevels = TRUE, + showRownames = TRUE, + showColnames = TRUE, + main = NA, + fontSize = 10, + fontSizeRow = fontSize, + fontSizeCol = fontSize, + displayNumbers = FALSE, + numberFormat = "\%.2f", + numberColor = "grey30", + fontSizeNumber = 0.8 * fontSize, + gapsRow = NULL, + gapsCol = NULL, + labelsRow = NULL, + labelsCol = NULL, + fileName = NA, + width = NA, + height = NA, + silent = FALSE, + rowLabel, + colLabel, + rowGroupOrder = NULL, + colGroupOrder = NULL, + ... +) } \arguments{ \item{mat}{numeric matrix of the values to be plotted.} @@ -276,7 +310,7 @@ pheatmap(test, clusteringDistanceCols = dcols) # Modify ordering of the clusters using clustering callback option -callback = function(hc, mat){ +callback = function(hc, mat) { sv = svd(t(mat))$v[, 1] dend = reorder(as.dendrogram(hc), wts = sv) as.hclust(dend) diff --git a/man/simulateCellscelda_C.Rd b/man/simulateCellscelda_C.Rd index 75aa0b6e..d366729d 100644 --- a/man/simulateCellscelda_C.Rd +++ b/man/simulateCellscelda_C.Rd @@ -4,9 +4,18 @@ \alias{simulateCellscelda_C} \title{Simulate cells from the celda_C model} \usage{ -simulateCellscelda_C(model, S = 5, CRange = c(50, 100), - NRange = c(500, 1000), G = 100, K = 5, alpha = 1, beta = 1, - seed = 12345, ...) +simulateCellscelda_C( + model, + S = 5, + CRange = c(50, 100), + NRange = c(500, 1000), + G = 100, + K = 5, + alpha = 1, + beta = 1, + seed = 12345, + ... +) } \arguments{ \item{model}{Character. Options available in `celda::availableModels`.} diff --git a/man/simulateCellscelda_CG.Rd b/man/simulateCellscelda_CG.Rd index 2e6b167a..64f93062 100644 --- a/man/simulateCellscelda_CG.Rd +++ b/man/simulateCellscelda_CG.Rd @@ -4,9 +4,21 @@ \alias{simulateCellscelda_CG} \title{Simulate cells from the celda_CG model} \usage{ -simulateCellscelda_CG(model, S = 5, CRange = c(50, 100), - NRange = c(500, 1000), G = 100, K = 5, L = 10, alpha = 1, - beta = 1, gamma = 5, delta = 1, seed = 12345, ...) +simulateCellscelda_CG( + model, + S = 5, + CRange = c(50, 100), + NRange = c(500, 1000), + G = 100, + K = 5, + L = 10, + alpha = 1, + beta = 1, + gamma = 5, + delta = 1, + seed = 12345, + ... +) } \arguments{ \item{model}{Character. Options available in `celda::availableModels`.} diff --git a/man/simulateCellscelda_G.Rd b/man/simulateCellscelda_G.Rd index 275bba79..3ecd1aae 100644 --- a/man/simulateCellscelda_G.Rd +++ b/man/simulateCellscelda_G.Rd @@ -4,8 +4,18 @@ \alias{simulateCellscelda_G} \title{Simulate cells from the celda_G model} \usage{ -simulateCellscelda_G(model, C = 100, NRange = c(500, 1000), G = 100, - L = 10, beta = 1, gamma = 5, delta = 1, seed = 12345, ...) +simulateCellscelda_G( + model, + C = 100, + NRange = c(500, 1000), + G = 100, + L = 10, + beta = 1, + gamma = 5, + delta = 1, + seed = 12345, + ... +) } \arguments{ \item{model}{Character. Options available in `celda::availableModels`.} diff --git a/man/simulateContaminatedMatrix.Rd b/man/simulateContaminatedMatrix.Rd index adee2589..db39fcc9 100644 --- a/man/simulateContaminatedMatrix.Rd +++ b/man/simulateContaminatedMatrix.Rd @@ -4,8 +4,15 @@ \alias{simulateContaminatedMatrix} \title{Simulate contaminated count matrix} \usage{ -simulateContaminatedMatrix(C = 300, G = 100, K = 3, NRange = c(500, - 1000), beta = 0.5, delta = c(1, 2), seed = 12345) +simulateContaminatedMatrix( + C = 300, + G = 100, + K = 3, + NRange = c(500, 1000), + beta = 0.5, + delta = c(1, 2), + seed = 12345 +) } \arguments{ \item{C}{Integer. Number of cells to be simulated. Default to be 300.} diff --git a/man/topRank.Rd b/man/topRank.Rd index ff3bbb3b..355be1e7 100644 --- a/man/topRank.Rd +++ b/man/topRank.Rd @@ -4,8 +4,7 @@ \alias{topRank} \title{Identify features with the highest influence on clustering.} \usage{ -topRank(matrix, n = 25, margin = 2, threshold = 0, - decreasing = TRUE) +topRank(matrix, n = 25, margin = 2, threshold = 0, decreasing = TRUE) } \arguments{ \item{matrix}{Numeric matrix.} @@ -18,7 +17,7 @@ columns. Default 2.} \item{threshold}{Numeric. Only return ranked rows or columns in the matrix that are above this threshold. If NULL, then no threshold will be applied. -Default 1.} +Default 0.} \item{decreasing}{Logical. Specifies if the rank should be decreasing. Default TRUE.} diff --git a/src/DecontX.cpp b/src/DecontX.cpp new file mode 100644 index 00000000..c339c23a --- /dev/null +++ b/src/DecontX.cpp @@ -0,0 +1,296 @@ +#include +// [[Rcpp::depends(RcppArmadillo)]] +using namespace Rcpp; using namespace arma; + + + +// [[Rcpp::export]] +Rcpp::List decontXEM(const arma::sp_mat& counts, + const NumericVector& counts_colsums, + const NumericVector& theta, + const NumericMatrix& eta, + const NumericMatrix& phi, + const IntegerVector& z, + const double& pseudocount) { + + // Perform error checking + if(counts.n_cols != theta.size()) { + stop("Length of 'theta' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != z.size()) { + stop("Length of 'z' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != counts_colsums.size()) { + stop("Length of 'counts_colsums' must be equal to the number of columns in 'counts'."); + } + if(counts.n_rows != phi.nrow()) { + stop("The number of rows in 'phi' must be equal to the number of rows in 'counts'."); + } + if(counts.n_rows != eta.nrow()) { + stop("The number of rows in 'eta' must be equal to the number of rows in 'counts'."); + } + if(phi.ncol() != eta.ncol()) { + stop("The number of columns in 'eta' must be equal to the number of columns in 'phi'."); + } + if(min(z) < 1 || max(z) > eta.ncol()) { + stop("The entries in 'z' need to be between 1 and the number of columns in eta and phi."); + } + + // Declare variables and functions + NumericVector new_theta(theta.size()); + NumericVector native_total(theta.size()); + NumericMatrix new_phi(phi.nrow(), phi.ncol()); + NumericMatrix new_eta(eta.nrow(), eta.ncol()); + +// std::fill(new_phi.begin(), new_phi.end(), pseudocount); +// std::fill(new_eta.begin(), new_eta.end(), pseudocount); + + // Obtaining 'fit_dirichlet' function from MCMCprecision package + Environment pkg = Environment::namespace_env("MCMCprecision"); + Function f = pkg["fit_dirichlet"]; + + int i; + int j; + int k; + int nr = phi.nrow(); + double x; + double pcontamin; + double pnative; + double normp; + double px; + for (arma::sp_mat::const_iterator it = counts.begin(); it != counts.end(); ++it) { + i = it.row(); + j = it.col(); + x = *it; + k = z[j] - 1; + + // Calculate variational probabilities + // Removing the log/exp speeds it up and produces the same result since + // there are only 2 probabilities being multiplied + + //pnative = log(phi(i,k) + pseudocount) + log(theta(j) + pseudocount); + //pcontamin = log(eta(i,k) + pseudocount) + log(1 - theta(j) + pseudocount); + pnative = (phi[nr * k + i] + pseudocount) * (theta[j] + pseudocount); + pcontamin = (eta[nr * k + i] + pseudocount) * (1 - theta[j] + pseudocount); + + // Normalize probabilities and add to proper components + //normp = exp(pnative) / (exp(pcontamin) + exp(pnative)); + normp = pnative / (pcontamin + pnative); + px = normp * x; + new_phi(i,k) += px; + native_total(j) += px; + } + + // Calculate Eta using Weights from Phi + NumericVector phi_rowsum = rowSums(new_phi); + for(i = 0; i < new_eta.ncol(); i++) { + for(j = 0; j < new_eta.nrow(); j++) { + new_eta(j,i) = phi_rowsum[j] - new_phi(j,i); + } + } + + // Normalize Phi and Eta + NumericVector phi_colsum = colSums(new_phi); + NumericVector eta_colsum = colSums(new_eta); + for(i = 0; i < new_phi.ncol(); i++) { + new_phi(_,i) = new_phi(_,i) / phi_colsum[i]; + new_eta(_,i) = new_eta(_,i) / eta_colsum[i]; + } + + // Update Theta + NumericVector contamination_prop = (counts_colsums - native_total) / counts_colsums; + NumericVector native_prop = 1 - contamination_prop; + NumericMatrix theta_raw = cbind(native_prop, contamination_prop); + + Rcpp::List result = f(Named("x", theta_raw)); + NumericVector delta = result["alpha"]; + new_theta = (native_total + delta[0]) / (counts_colsums + result["sum"]); + + return Rcpp::List::create(Rcpp::Named("phi") = new_phi, + Rcpp::Named("eta") = new_eta, + Rcpp::Named("theta") = new_theta, + Rcpp::Named("delta") = delta, + Rcpp::Named("contamination") = contamination_prop); +} + + + + +// [[Rcpp::export]] +double decontXLogLik(const arma::sp_mat& counts, + const NumericVector& theta, + const NumericMatrix& eta, + const NumericMatrix& phi, + const IntegerVector& z, + const double& pseudocount) { + + // Perform error checking + if(counts.n_cols != theta.size()) { + stop("Length of 'theta' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != z.size()) { + stop("Length of 'z' must be equal to the number of columns in 'counts'."); + } + if(counts.n_rows != phi.nrow()) { + stop("The number of rows in 'phi' must be equal to the number of rows in 'counts'."); + } + if(counts.n_rows != eta.nrow()) { + stop("The number of rows in 'eta' must be equal to the number of rows in 'counts'."); + } + if(phi.ncol() != eta.ncol()) { + stop("The number of columns in 'eta' must be equal to the number of columns in 'phi'."); + } + if(min(z) < 1 || max(z) > eta.ncol()) { + stop("The entries in 'z' need to be between 1 and the number of columns in eta and phi."); + } + + // Declare variables and functions + double loglik = 0; + + int i; + int j; + int k; + double x; + int nr = phi.nrow(); + + // Original R code: + // ll <- sum(Matrix::t(counts) * log(theta * t(phi)[z, ] + + // (1 - theta) * t(eta)[z, ] + 1e-20)) + + for (arma::sp_mat::const_iterator it = counts.begin(); it != counts.end(); ++it) { + i = it.row(); + j = it.col(); + x = *it; + k = z[j] - 1; + + loglik += x * log((phi[nr * k + i] * theta[j]) + (eta[nr * k + i] * (1 - theta[j])) + pseudocount); + } + + return loglik; +} + + + +// [[Rcpp::export]] +Rcpp::List decontXInitialize(const arma::sp_mat& counts, + const NumericVector& theta, + const IntegerVector& z, + const double& pseudocount) { + + // Perform error checking + if(counts.n_cols != theta.size()) { + stop("Length of 'theta' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != z.size()) { + stop("Length of 'z' must be equal to the number of columns in 'counts'."); + } + + // Declare variables and functions + NumericMatrix new_phi(counts.n_rows, max(z)); + NumericMatrix new_eta(counts.n_rows, max(z)); + std::fill(new_phi.begin(), new_phi.end(), pseudocount); + std::fill(new_eta.begin(), new_eta.end(), pseudocount); + + int i; + int j; + int k; + double x; + + for (arma::sp_mat::const_iterator it = counts.begin(); it != counts.end(); ++it) { + i = it.row(); + j = it.col(); + x = *it; + k = z[j] - 1; + + new_phi(i,k) += x * theta(j); + } + + // Calculate Eta using Weights from Phi + NumericVector phi_rowsum = rowSums(new_phi); + for(i = 0; i < new_eta.ncol(); i++) { + for(j = 0; j < new_eta.nrow(); j++) { + new_eta(j,i) = phi_rowsum[j] - new_phi(j,i); + } + } + + // Normalize Phi and Eta + NumericVector phi_colsum = colSums(new_phi); + NumericVector eta_colsum = colSums(new_eta); + for(i = 0; i < new_phi.ncol(); i++) { + new_phi(_,i) = new_phi(_,i) / phi_colsum[i]; + new_eta(_,i) = new_eta(_,i) / eta_colsum[i]; + } + + return Rcpp::List::create(Rcpp::Named("phi") = new_phi, + Rcpp::Named("eta") = new_eta); + +} + + + + +// [[Rcpp::export]] +arma::sp_mat calculateNativeMatrix(const arma::sp_mat& counts, + const arma::sp_mat& native_counts, + const NumericVector& theta, + const NumericMatrix& eta, + const NumericMatrix& phi, + const IntegerVector& z, + const IntegerVector row_index, + const IntegerVector col_index, + const double& pseudocount) { + + // Perform error checking + if(counts.n_cols != theta.size()) { + stop("Length of 'theta' must be equal to the number of columns in 'counts'."); + } + if(counts.n_cols != z.size()) { + stop("Length of 'z' must be equal to the number of columns in 'counts'."); + } + if(counts.n_rows != phi.nrow()) { + stop("The number of rows in 'phi' must be equal to the number of rows in 'counts'."); + } + if(counts.n_rows != eta.nrow()) { + stop("The number of rows in 'eta' must be equal to the number of rows in 'counts'."); + } + if(phi.ncol() != eta.ncol()) { + stop("The number of columns in 'eta' must be equal to the number of columns in 'phi'."); + } + if(min(z) < 1 || max(z) > eta.ncol()) { + stop("The entries in 'z' need to be between 1 and the number of columns in eta and phi."); + } + if(max(row_index) - 1 > native_counts.n_rows || min(row_index) < 0) { + stop("The entries in 'row_index' need to be less than 'nrow(native_counts)' and greater than 0."); + } + if(max(col_index) - 1 > native_counts.n_cols || min(col_index) < 0) { + stop("The entries in 'row_index' need to be less than 'ncol(native_counts)' and greater than 0."); + } + + arma::sp_mat native_matrix = native_counts; + + int i; + int j; + int k; + double x; + double pcontamin; + double pnative; + double normp; + for (arma::sp_mat::const_iterator it = counts.begin(); it != counts.end(); ++it) { + i = it.row(); + j = it.col(); + x = *it; + k = z[j] - 1; + + // Calculate variational probabilities + pnative = log(phi(i,k) + pseudocount) + log(theta(j) + pseudocount); + pcontamin = log(eta(i,k) + pseudocount) + log(1 - theta(j) + pseudocount); + + // Normalize probabilities and add to proper components + normp = exp(pnative) / (exp(pcontamin) + exp(pnative)); + native_matrix.at(row_index(i) - 1, col_index(j) - 1) = normp * x; + } + + return native_matrix; +} + + diff --git a/src/Makevars b/src/Makevars index 8fa55139..d3e3f414 100755 --- a/src/Makevars +++ b/src/Makevars @@ -1,7 +1,14 @@ -## With Rcpp 0.11.0 and later, we no longer need to set PKG_LIBS as there is -## no user-facing library. The include path to headers is already set by R. -#PKG_LIBS = ## With R 3.1.0 or later, you can uncomment the following line to tell R to -## enable compilation with C++11 (or even C++14) where available -#CXX_STD = CXX11 +## enable compilation with C++11 (where available) +## +## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider +## availability of the package we do not yet enforce this here. It is however +## recommended for client packages to set it. +## +## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP +## support within Armadillo prefers / requires it +CXX_STD = CXX11 + +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win b/src/Makevars.win index 8fa55139..d3e3f414 100755 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,7 +1,14 @@ -## With Rcpp 0.11.0 and later, we no longer need to set PKG_LIBS as there is -## no user-facing library. The include path to headers is already set by R. -#PKG_LIBS = ## With R 3.1.0 or later, you can uncomment the following line to tell R to -## enable compilation with C++11 (or even C++14) where available -#CXX_STD = CXX11 +## enable compilation with C++11 (where available) +## +## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider +## availability of the package we do not yet enforce this here. It is however +## recommended for client packages to set it. +## +## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP +## support within Armadillo prefers / requires it +CXX_STD = CXX11 + +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 37a8a6b7..34d48cda 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,11 +1,78 @@ // Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +#include #include #include using namespace Rcpp; +// decontXEM +Rcpp::List decontXEM(const arma::sp_mat& counts, const NumericVector& counts_colsums, const NumericVector& theta, const NumericMatrix& eta, const NumericMatrix& phi, const IntegerVector& z, const double& pseudocount); +RcppExport SEXP _celda_decontXEM(SEXP countsSEXP, SEXP counts_colsumsSEXP, SEXP thetaSEXP, SEXP etaSEXP, SEXP phiSEXP, SEXP zSEXP, SEXP pseudocountSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type counts_colsums(counts_colsumsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type eta(etaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type phi(phiSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type z(zSEXP); + Rcpp::traits::input_parameter< const double& >::type pseudocount(pseudocountSEXP); + rcpp_result_gen = Rcpp::wrap(decontXEM(counts, counts_colsums, theta, eta, phi, z, pseudocount)); + return rcpp_result_gen; +END_RCPP +} +// decontXLogLik +double decontXLogLik(const arma::sp_mat& counts, const NumericVector& theta, const NumericMatrix& eta, const NumericMatrix& phi, const IntegerVector& z, const double& pseudocount); +RcppExport SEXP _celda_decontXLogLik(SEXP countsSEXP, SEXP thetaSEXP, SEXP etaSEXP, SEXP phiSEXP, SEXP zSEXP, SEXP pseudocountSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type eta(etaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type phi(phiSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type z(zSEXP); + Rcpp::traits::input_parameter< const double& >::type pseudocount(pseudocountSEXP); + rcpp_result_gen = Rcpp::wrap(decontXLogLik(counts, theta, eta, phi, z, pseudocount)); + return rcpp_result_gen; +END_RCPP +} +// decontXInitialize +Rcpp::List decontXInitialize(const arma::sp_mat& counts, const NumericVector& theta, const IntegerVector& z, const double& pseudocount); +RcppExport SEXP _celda_decontXInitialize(SEXP countsSEXP, SEXP thetaSEXP, SEXP zSEXP, SEXP pseudocountSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type z(zSEXP); + Rcpp::traits::input_parameter< const double& >::type pseudocount(pseudocountSEXP); + rcpp_result_gen = Rcpp::wrap(decontXInitialize(counts, theta, z, pseudocount)); + return rcpp_result_gen; +END_RCPP +} +// calculateNativeMatrix +arma::sp_mat calculateNativeMatrix(const arma::sp_mat& counts, const arma::sp_mat& native_counts, const NumericVector& theta, const NumericMatrix& eta, const NumericMatrix& phi, const IntegerVector& z, const IntegerVector row_index, const IntegerVector col_index, const double& pseudocount); +RcppExport SEXP _celda_calculateNativeMatrix(SEXP countsSEXP, SEXP native_countsSEXP, SEXP thetaSEXP, SEXP etaSEXP, SEXP phiSEXP, SEXP zSEXP, SEXP row_indexSEXP, SEXP col_indexSEXP, SEXP pseudocountSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const arma::sp_mat& >::type native_counts(native_countsSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type eta(etaSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type phi(phiSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type z(zSEXP); + Rcpp::traits::input_parameter< const IntegerVector >::type row_index(row_indexSEXP); + Rcpp::traits::input_parameter< const IntegerVector >::type col_index(col_indexSEXP); + Rcpp::traits::input_parameter< const double& >::type pseudocount(pseudocountSEXP); + rcpp_result_gen = Rcpp::wrap(calculateNativeMatrix(counts, native_counts, theta, eta, phi, z, row_index, col_index, pseudocount)); + return rcpp_result_gen; +END_RCPP +} // cG_calcGibbsProbY_Simple NumericVector cG_calcGibbsProbY_Simple(const IntegerMatrix counts, IntegerVector nGbyTS, IntegerMatrix nTSbyC, IntegerVector nbyTS, IntegerVector nbyG, const IntegerVector y, const int L, const int index, const double gamma, const double beta, const double delta); RcppExport SEXP _celda_cG_calcGibbsProbY_Simple(SEXP countsSEXP, SEXP nGbyTSSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP indexSEXP, SEXP gammaSEXP, SEXP betaSEXP, SEXP deltaSEXP) { @@ -27,6 +94,52 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cG_CalcGibbsProbY_ori +NumericVector cG_CalcGibbsProbY_ori(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY_ori(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type nTSbyC(nTSbyCSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyTS(nbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nGbyTS(nGbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyG(nbyGSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type y(ySEXP); + Rcpp::traits::input_parameter< const int >::type L(LSEXP); + Rcpp::traits::input_parameter< const int >::type nG(nGSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_beta(lg_betaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); + Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_ori(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + return rcpp_result_gen; +END_RCPP +} +// cG_CalcGibbsProbY_fastRow +NumericVector cG_CalcGibbsProbY_fastRow(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); +RcppExport SEXP _celda_cG_CalcGibbsProbY_fastRow(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const int >::type index(indexSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type counts(countsSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type nTSbyC(nTSbyCSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyTS(nbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nGbyTS(nGbyTSSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nbyG(nbyGSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type y(ySEXP); + Rcpp::traits::input_parameter< const int >::type L(LSEXP); + Rcpp::traits::input_parameter< const int >::type nG(nGSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_beta(lg_betaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_gamma(lg_gammaSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type lg_delta(lg_deltaSEXP); + Rcpp::traits::input_parameter< const double >::type delta(deltaSEXP); + rcpp_result_gen = Rcpp::wrap(cG_CalcGibbsProbY_fastRow(index, counts, nTSbyC, nbyTS, nGbyTS, nbyG, y, L, nG, lg_beta, lg_gamma, lg_delta, delta)); + return rcpp_result_gen; +END_RCPP +} // cG_CalcGibbsProbY NumericVector cG_CalcGibbsProbY(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, const IntegerVector& nGbyTS, const IntegerVector& nbyG, const IntegerVector& y, const int L, const int nG, const NumericVector& lg_beta, const NumericVector& lg_gamma, const NumericVector& lg_delta, const double delta); RcppExport SEXP _celda_cG_CalcGibbsProbY(SEXP indexSEXP, SEXP countsSEXP, SEXP nTSbyCSEXP, SEXP nbyTSSEXP, SEXP nGbyTSSEXP, SEXP nbyGSEXP, SEXP ySEXP, SEXP LSEXP, SEXP nGSEXP, SEXP lg_betaSEXP, SEXP lg_gammaSEXP, SEXP lg_deltaSEXP, SEXP deltaSEXP) { @@ -108,7 +221,13 @@ RcppExport SEXP _rowSumByGroup_numeric(SEXP, SEXP); RcppExport SEXP _rowSumByGroupChange(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { + {"_celda_decontXEM", (DL_FUNC) &_celda_decontXEM, 7}, + {"_celda_decontXLogLik", (DL_FUNC) &_celda_decontXLogLik, 6}, + {"_celda_decontXInitialize", (DL_FUNC) &_celda_decontXInitialize, 4}, + {"_celda_calculateNativeMatrix", (DL_FUNC) &_celda_calculateNativeMatrix, 9}, {"_celda_cG_calcGibbsProbY_Simple", (DL_FUNC) &_celda_cG_calcGibbsProbY_Simple, 11}, + {"_celda_cG_CalcGibbsProbY_ori", (DL_FUNC) &_celda_cG_CalcGibbsProbY_ori, 13}, + {"_celda_cG_CalcGibbsProbY_fastRow", (DL_FUNC) &_celda_cG_CalcGibbsProbY_fastRow, 13}, {"_celda_cG_CalcGibbsProbY", (DL_FUNC) &_celda_cG_CalcGibbsProbY, 13}, {"_celda_eigenMatMultInt", (DL_FUNC) &_celda_eigenMatMultInt, 2}, {"_celda_fastNormProp", (DL_FUNC) &_celda_fastNormProp, 2}, diff --git a/src/cG_calcGibbsProbY.cpp b/src/cG_calcGibbsProbY.cpp index 904bd502..c7a3f609 100644 --- a/src/cG_calcGibbsProbY.cpp +++ b/src/cG_calcGibbsProbY.cpp @@ -50,7 +50,7 @@ NumericVector cG_calcGibbsProbY_Simple(const IntegerMatrix counts, // [[Rcpp::export]] -NumericVector cG_CalcGibbsProbY(const int index, +NumericVector cG_CalcGibbsProbY_ori(const int index, const IntegerMatrix& counts, const IntegerMatrix& nTSbyC, const IntegerVector& nbyTS, @@ -120,3 +120,128 @@ NumericVector cG_CalcGibbsProbY(const int index, return(probs); } + +// [[Rcpp::export]] +NumericVector cG_CalcGibbsProbY_fastRow(const int index, + const IntegerMatrix& counts, + const IntegerMatrix& nTSbyC, + const IntegerVector& nbyTS, + const IntegerVector& nGbyTS, + const IntegerVector& nbyG, + const IntegerVector& y, + const int L, + const int nG, + const NumericVector& lg_beta, + const NumericVector& lg_gamma, + const NumericVector& lg_delta, + const double delta) { + + int index0 = index - 1; + int current_y = y[index0] - 1; + int i; + + NumericVector probs(L); + + // Calculate probabilities related to the "n.TS.by.C" part of equation one time up front + // The first case of if statement represents when the current feature is already added to that module + // The second case represents when the current feature is NOT YET added to that module + for (i = 0; i < L; i++) { + if (i == current_y ) { + for (int col = 0; col < counts.ncol(); col++) { + probs[i] += lg_beta[nTSbyC(i, col)]; + probs[i] -= lg_beta[nTSbyC(i, col) - counts(index0, col)]; + } + } else { + for (int col = 0; col < counts.ncol(); col++) { + probs[i] += lg_beta[nTSbyC(i, col) + counts(index0, col)]; + probs[i] -= lg_beta[nTSbyC(i, col)]; + } + } + } + + // Calculate the probabilities for each module + // If statements determine whether to add or subtract counts from each probability + for (i = 0; i < L; i++) { + if (i == current_y) { + probs[i] += lg_gamma[nGbyTS[i]]; + probs[i] -= lg_gamma[nGbyTS[i] - 1]; + probs[i] += lg_delta[nGbyTS[i]]; + probs[i] -= lg_delta[nGbyTS[i] - 1]; + probs[i] += lgamma(nbyTS[i] - nbyG[index0] + (nGbyTS[i] - 1) * delta); + probs[i] -= lgamma(nbyTS[i] + nGbyTS[i] * delta); + } else { + probs[i] += lg_gamma[nGbyTS[i] + 1]; + probs[i] -= lg_gamma[nGbyTS[i]]; + probs[i] += lg_delta[nGbyTS[i] + 1]; + probs[i] -= lg_delta[nGbyTS[i]]; + probs[i] += lgamma(nbyTS[i] + nGbyTS[i] * delta); + probs[i] -= lgamma(nbyTS[i] + nbyG[index0] + (nGbyTS[i] + 1) * delta); + } + } + + return(probs); +} + + +// [[Rcpp::export]] +NumericVector cG_CalcGibbsProbY(const int index, + const IntegerMatrix& counts, + const IntegerMatrix& nTSbyC, + const IntegerVector& nbyTS, + const IntegerVector& nGbyTS, + const IntegerVector& nbyG, + const IntegerVector& y, + const int L, + const int nG, + const NumericVector& lg_beta, + const NumericVector& lg_gamma, + const NumericVector& lg_delta, + const double delta) { + + int index0 = index - 1; + int current_y = y[index0] - 1; + int i; + int j,k; + + NumericVector probs(L); + + // Calculate probabilities related to the "n.TS.by.C" part of equation one time up front + // The first case of if statement represents when the current feature is already added to that module + // The second case represents when the current feature is NOT YET added to that module + for (int col = 0; col < counts.ncol(); col++) { + k = col * nG + index0; // Index for the current feature in counts matrix + for (i = 0; i < L; i++) { + j = col * L + i; // Index for the current module in the n.TS.by.C matrix + if (i == current_y) { + probs[i] += lg_beta[nTSbyC[j]]; + probs[i] -= lg_beta[nTSbyC[j] - counts[k]]; + } else { + probs[i] += lg_beta[nTSbyC[j] + counts[k]]; + probs[i] -= lg_beta[nTSbyC[j]]; + } + } + } + + + // Calculate the probabilities for each module + // If statements determine whether to add or subtract counts from each probability + for (i = 0; i < L; i++) { + if (i == current_y) { + probs[i] += lg_gamma[nGbyTS[i]]; + probs[i] -= lg_gamma[nGbyTS[i] - 1]; + probs[i] += lg_delta[nGbyTS[i]]; + probs[i] -= lg_delta[nGbyTS[i] - 1]; + probs[i] += lgamma(nbyTS[i] - nbyG[index0] + (nGbyTS[i] - 1) * delta); + probs[i] -= lgamma(nbyTS[i] + nGbyTS[i] * delta); + } else { + probs[i] += lg_gamma[nGbyTS[i] + 1]; + probs[i] -= lg_gamma[nGbyTS[i]]; + probs[i] += lg_delta[nGbyTS[i] + 1]; + probs[i] -= lg_delta[nGbyTS[i]]; + probs[i] += lgamma(nbyTS[i] + nGbyTS[i] * delta); + probs[i] -= lgamma(nbyTS[i] + nbyG[index0] + (nGbyTS[i] + 1) * delta); + } + } + + return(probs); +} diff --git a/tests/testthat/test-decon.R b/tests/testthat/test-decon.R index d62b9745..f5baa4ba 100644 --- a/tests/testthat/test-decon.R +++ b/tests/testthat/test-decon.R @@ -3,14 +3,9 @@ library(celda) context("Testing Deconx") deconSim <- simulateContaminatedMatrix(K = 10, delta = c(1, 5)) -modelDecontXoneBatch <- .decontXoneBatch(deconSim$observedCounts, +modelDecontXoneBatch <- decontX(deconSim$observedCounts, z = deconSim$z, maxIter = 2) -modelDecontXoneBatchIter1 <- .decontXoneBatch(deconSim$observedCounts, - z = deconSim$z, - maxIter = 1) -modelDecontXoneBatchbg <- decontX(deconSim$observedCounts, - maxIter = 2) deconSim2 <- simulateContaminatedMatrix(K = 10, delta = 5) batchDecontX <- decontX(cbind(deconSim$observedCounts, @@ -18,10 +13,6 @@ batchDecontX <- decontX(cbind(deconSim$observedCounts, z = c(deconSim$z, deconSim2$z), batch = rep(seq(2), each = ncol(deconSim$observedCounts)), maxIter = 2) -batchDecontXBg <- decontX(cbind(deconSim$observedCounts, - deconSim2$observedCounts), - batch = rep(seq(2), each = ncol(deconSim$observedCounts)), - maxIter = 2) ## simulateContaminatedMatrix test_that(desc = "Testing simulateContaminatedMatrix", { @@ -35,53 +26,37 @@ test_that(desc = "Testing simulateContaminatedMatrix", { }) ## DecontX -test_that(desc = "Testing DecontX", { - expect_equal(ncol(deconSim$observedCounts) + ncol(deconSim2$observedCounts), - ncol(batchDecontX$resList$estNativeCounts)) - # expect_equal(length(batchDecontX$resList$estConp) , - # ncol(batchDecontX$resList$estNativeCounts)) - expect_equal(batchDecontXBg$method, "background") -}) ## .decontXoneBatch test_that(desc = "Testing .decontXoneBatch", { - expect_equal(modelDecontXoneBatch$resList$estConp, - 1 - colSums(modelDecontXoneBatch$resList$estNativeCounts) / - colSums(deconSim$observedCounts)) - expect_error(.decontXoneBatch(counts = deconSim$observedCounts, + expect_error(decontX(x = deconSim$observedCounts, z = deconSim$z, delta = -1), "'delta' should be a single positive value.") - expect_error(.decontXoneBatch(counts = deconSim$observedCounts, + expect_error(decontX(x = deconSim$observedCounts, z = deconSim$z, delta = c(1, 1)), "'delta' should be a single positive value.") - expect_error(.decontXoneBatch(counts = deconSim$observedCounts, + expect_error(decontX(x = deconSim$observedCounts, z = c(deconSim$z, 1)), paste0("'z' must be of the same length as the number of cells in the", " 'counts' matrix.")) expect_error(.decontXoneBatch(counts = deconSim$observedCounts, z = rep(1, ncol( deconSim$observedCounts))), - "'z' must have at least 2 different values.") + "No need to decontaminate when only one cluster is in the dataset.") countsNA <- deconSim$observedCounts countsNA[1, 1] <- NA expect_error(.decontXoneBatch(counts = countsNA, z = deconSim$z), "Missing value in 'counts' matrix.") }) -test_that(desc = "Testing .decontXoneBatch using background distribution", { - expect_equal( - modelDecontXoneBatchbg$resList$estConp, - 1 - colSums(modelDecontXoneBatchbg$resList$estNativeCounts) / - deconSim$NByC) -}) ## logLikelihood #test_that(desc = "Testing logLikelihood.DecontXoneBatch", { # z.process = processCellLabels(deconSim$z, # num.cells=ncol(deconSim$observedCounts) ) - # expect_equal( decon.calcLL(counts=deconSim$observedCounts, z=z.process , + # expect_equal( decon.calcLL(x=deconSim$observedCounts, z=z.process , # theta=modelDecontXoneBatch$resList$theta, # eta=modelDecontXoneBatch$resList$est.ConDist, # phi=modelDecontXoneBatch$resList$est.GeneDist ), diff --git a/vignettes/DecontX-analysis.Rmd b/vignettes/DecontX-analysis.Rmd index 3e8b6ed9..9d7061a0 100644 --- a/vignettes/DecontX-analysis.Rmd +++ b/vignettes/DecontX-analysis.Rmd @@ -67,6 +67,7 @@ colSums(simCounts$phi) colSums(simCounts$eta) ``` + # Decontamination using DecontX DecontX uses bayesian method to estimate and remove contamination via varitaional inference. diff --git a/vignettes/buildTreeHybrid-analysis.Rmd b/vignettes/FindMarkers-analysis.Rmd similarity index 76% rename from vignettes/buildTreeHybrid-analysis.Rmd rename to vignettes/FindMarkers-analysis.Rmd index 514217d9..92943a12 100644 --- a/vignettes/buildTreeHybrid-analysis.Rmd +++ b/vignettes/FindMarkers-analysis.Rmd @@ -15,7 +15,7 @@ vignette: > **CE**llular **L**atent **D**irichlet **A**llocation (celda), is among multiple approaches to cluster cells from single-cell RNA-seq data into discrete sub-populations. In most cases, annotating a concise set of important features for distinguishing these sub-populations is not a trivial task. -In this vignette we will demonstrate the implementation of a multiclass decision tree approach to simultaneously sort and annotate cell cluster label estimations by generating a sequence of univariate rules for each cluster. +In this vignette we will demonstrate the implementation of a multiclass decision tree approach to simultaneously sort and annotate cell cluster label estimations by generating a sequence of univariate rules for each cluster. The procedure has two main deviations from simple multiclass decision tree procedures. First, at each split cells from the same cluster are never separated during tree building. Instead cells from the same population are moved to one-side of a particular split based on majority voting. Second, each cluster split can be determined by one of two heuristics, as follows... @@ -63,7 +63,7 @@ cm <- celda_CG(counts = counts, L = 10, K = 10, verbose = FALSE) # Format celda output for decision tree generation -The decision trees are generated and evaluated by `buildTreeHybrid`. `buildTreeHybrid` requires two arguments: **features** and **class**. **features** is a numeric matrix with a row for every variable (or feature) to be used in sorting and a column for every sample. In this example we will use a factorized matrix of the gene clusters as our feature matrix (Check `?factorizedMatrix` for more details). **class** is a vector with a label assignment for every sample. Note, that neither **features** nor **class** may have any missing values. +The decision trees are generated and evaluated by `findMarkers`. `findMarkers` requires two arguments: **features** and **class**. **features** is a numeric matrix with a row for every variable (or feature) to be used in sorting and a column for every sample. In this example we will use a factorized matrix of the gene clusters as our feature matrix (Check `?factorizedMatrix` for more details). **class** is a vector with a label assignment for every sample. Note, that neither **features** nor **class** may have any missing values. ```{r, eval = TRUE, message = FALSE} # Get features matrix and cluster assignments @@ -74,7 +74,7 @@ class <- clusters(cm)$z # Generate celda decision tree -`buildTreeHybrid` allows for different parameter options. The optimal combination of these parameters is generally analysis specific and may require some trial and error. Parameters include: +`findMarkers` allows for different parameter options. The optimal combination of these parameters is generally analysis specific and may require some trial and error. Parameters include: - **oneoffMetric** A selection for one of two possible metrics for evaluating good one-off splits for segregating a single cluster from every other cluster based on up-regulation of that cluster alone. The performance of either is a value between 0 and 1. The two options include: - *modified F1* - Finds the univariate split that maximizes the harmonic mean of the sensitivity and precision of the up-regulated cluster, as well as the minimum cluster-specific sensitivity across all down-regulated clusters. @@ -86,18 +86,18 @@ class <- clusters(cm)$z ```{r, eval = TRUE, message = FALSE} -DecTree <- buildTreeHybrid(features, - class, - oneoffMetric = "modified F1", - threshold = 0.95, - reuseFeatures = FALSE, - altSplit = TRUE, - consecutiveOneoff = FALSE) +DecTree <- findMarkers(features, + class, + oneoffMetric = "modified F1", + threshold = 0.95, + reuseFeatures = FALSE, + altSplit = TRUE, + consecutiveOneoff = FALSE) ``` -## `buildTreeHybrid` output +## `findMarkers` output -The `buildTreeHybrid` output is a named list of five elements +The `findMarkers` output is a named list of four elements - **rules** A named list with one `data.frame` for every label. Each `data.frame` has five columns and gives the set of rules for distinguishing @@ -114,17 +114,6 @@ The `buildTreeHybrid` output is a named list of five elements level of the first split of the tree. - **dendro** A dendrogram object of the decision tree output -- **summaryMatrix** A K(labels) by L(features) matrix representation of - the decision rules. Columns denote features and rows denote labels. Non-0 - values denote instances where a feature was used on a given label. Positive - and negative values denote whether the values of the label for that feature - were greater than or less than the decision threshold, respectively. The - magnitude of Non-0 values denote the level at which the feature was used, - where the first split has a magnitude of 1. Note, if reuse_features = TRUE, - only the final usage of a feature for a given label is shown. - \item prediction** A character vector of label of predictions of the - training data using the final model. "MISSING" if label prediction was - ambiguous. - **performance** A named list denoting the training performance of the - *accuracy* - (number correct/number of samples) for the whole set of @@ -142,11 +131,11 @@ The `buildTreeHybrid` output is a named list of five elements ```{r, eval = TRUE, message = FALSE} plotDendro(DecTree, - classLabel = NULL, - addSensPrec = TRUE, - leafSize = 24, - boxSize = 7, - boxColor = "black") + classLabel = NULL, + addSensPrec = TRUE, + leafSize = 24, + boxSize = 7, + boxColor = "black") ``` In the plot, the feature(s) used for splits determined by the one-off metric are printed above the cluster labels for which they are markers. Alternatively, the features used for the balanced splits are centered above the two sets of clusters defined by that split. The up-regulated set of clusters for a balanced split are one the right side of that split. The size, sensitivitie and precision of each class are printed below the leaf labels, respectively. @@ -156,21 +145,39 @@ In `plotDendro` the **classLabel** argument may used to only print the sequence ```{r, eval = TRUE, message = FALSE} plotDendro(DecTree, - classLabel = "1", - addSensPrec = TRUE, - leafSize = 15, - boxSize = 7, - boxColor = "black") + classLabel = "1", + addSensPrec = TRUE, + leafSize = 15, + boxSize = 7, + boxColor = "black") ``` # Get label estimates from features matrix -`buildTreeHybrid` performs label estimation of each sample in the training set automatically. You can use the set of rules generated by `buildTreeHybrid` to predict the labels on an independent feature matrix using `getDecisions`. +`findMarkers` performs label estimation of each sample in the training set automatically. You can use the set of rules generated by `findMarkers` to predict the labels on an independent feature matrix using `getDecisions`. ```{r, eval = TRUE, message = FALSE} head(getDecisions(DecTree$rules, features)) ``` +# Create decision tree with meta clusters + +If you have a priori understanding of sub-groups of your cluster labels, you can ensure that these sub-groups are not separated up-stream in the tree by using the optional *cellTypes* argument. This is just a named list of labels in your *class* vector. For example, if we knew that clusters, 4, 5, and 1 were of the same subtype of clusters, we could do the following... + +```{r, eval = TRUE, message = FALSE} +# Run with a hierarchichal split +cellTypes <- list(metaLabel = c("4", "5", "1")) +DecTreeMeta <- findMarkers(features, + class, + cellTypes, + oneoffMetric = "modified F1", + threshold = 1, + reuseFeatures = F, + consecutiveOneoff = FALSE) +plotDendro(DecTreeMeta) +``` + + # Session Information ```{r}